From 8a8d138dcc95b1248b66fb3dd64a1849b6da6ff5 Mon Sep 17 00:00:00 2001 From: Hubert Tarasiuk Date: Sat, 9 May 2015 22:47:24 +0200 Subject: [PATCH] Support If-Modified-Since header in timestamping mode. * src/wget.h: Add IF_MODIFIED_SINCE enum for dt. Add TIMECONV_ERR enum to uerr_t. * src/http.c (time_to_rfc1123): Convert time_t do http time. * src/http.c (initialize_request): Include If-Modified-Since header if appropriate. * src/http.c (set_file_timestamp): Separate this code from check_file_output. * src/http.c (check_file_output): Use set_file_timestamp. * src/http.c (gethttp): Handle properly 304 return code and 200 if server ignores If-Modified-Since headers. * src/http.c (http_loop): Load filename to hstat if condget was requested, use IF_MODIFIED_SINCE if requested and current timestamp can be obtained. --- src/http.c | 250 ++++++++++++++++++++++++++++++++++++++++------------- src/wget.h | 6 +- 2 files changed, 194 insertions(+), 62 deletions(-) diff --git a/src/http.c b/src/http.c index 54eb106a..777903bf 100644 --- a/src/http.c +++ b/src/http.c @@ -1681,6 +1681,58 @@ read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen, } while (0) #endif /* def __VMS [else] */ +/* + Convert time_t to one of valid HTTP date formats + ie. rfc1123-date. + + HTTP-date = rfc1123-date | rfc850-date | asctime-date + rfc1123-date = wkday "," SP date1 SP time SP "GMT" + rfc850-date = weekday "," SP date2 SP time SP "GMT" + asctime-date = wkday SP date3 SP time SP 4DIGIT + date1 = 2DIGIT SP month SP 4DIGIT + ; day month year (e.g., 02 Jun 1982) + date2 = 2DIGIT "-" month "-" 2DIGIT + ; day-month-year (e.g., 02-Jun-82) + date3 = month SP ( 2DIGIT | ( SP 1DIGIT )) + ; month day (e.g., Jun 2) + time = 2DIGIT ":" 2DIGIT ":" 2DIGIT + ; 00:00:00 - 23:59:59 + wkday = "Mon" | "Tue" | "Wed" + | "Thu" | "Fri" | "Sat" | "Sun" + weekday = "Monday" | "Tuesday" | "Wednesday" + | "Thursday" | "Friday" | "Saturday" | "Sunday" + month = "Jan" | "Feb" | "Mar" | "Apr" + | "May" | "Jun" | "Jul" | "Aug" + | "Sep" | "Oct" | "Nov" | "Dec" + + source: RFC2616 */ +static uerr_t +time_to_rfc1123 (time_t time, char *buf, size_t bufsize) +{ + static const char *wkday[] = { "Sun", "Mon", "Tue", "Wed", + "Thu", "Fri", "Sat" }; + static const char *month[] = { "Jan", "Feb", "Mar", "Apr", + "May", "Jun", "Jul", "Aug", + "Sep", "Oct", "Nov", "Dec" }; + /* rfc1123 example: Thu, 01 Jan 1998 22:12:57 GMT */ + static const char *time_format = "%s, %02d %s %04d %02d:%02d:%02d GMT"; + + struct tm *gtm = gmtime (&time); + if (!gtm) + { + logprintf (LOG_NOTQUIET, + _("gmtime failed. This is probably a bug.\n")); + return TIMECONV_ERR; + } + + snprintf (buf, bufsize, time_format, wkday[gtm->tm_wday], + gtm->tm_mday, month[gtm->tm_mon], + gtm->tm_year + 1900, gtm->tm_hour, + gtm->tm_min, gtm->tm_sec); + + return RETROK; +} + static struct request * initialize_request (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, bool inhibit_keep_alive, bool *basic_auth_finished, @@ -1723,6 +1775,20 @@ initialize_request (struct url *u, struct http_stat *hs, int *dt, struct url *pr /* ... but some HTTP/1.0 caches doesn't implement Cache-Control. */ request_set_header (req, "Pragma", "no-cache", rel_none); } + if (*dt & IF_MODIFIED_SINCE) + { + char strtime[32]; + uerr_t err = time_to_rfc1123 (hs->orig_file_tstamp, strtime, countof (strtime)); + + if (err != RETROK) + { + logputs (LOG_VERBOSE, _("Cannot convert timestamp to http format. " + "Falling back to time 0 as last modification " + "time.\n")); + strcpy (strtime, "Thu, 01 Jan 1970 00:00:00 GMT"); + } + request_set_header (req, "If-Modified-Since", xstrdup (strtime), rel_value); + } if (hs->restval) request_set_header (req, "Range", aprintf ("bytes=%s-", @@ -2024,6 +2090,69 @@ establish_connection (struct url *u, struct url **conn_ref, return RETROK; } +static uerr_t +set_file_timestamp (struct http_stat *hs) +{ + size_t filename_len = strlen (hs->local_file); + char *filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX)); + bool local_dot_orig_file_exists = false; + char *local_filename = NULL; + struct_stat st; + + if (opt.backup_converted) + /* If -K is specified, we'll act on the assumption that it was specified + last time these files were downloaded as well, and instead of just + comparing local file X against server file X, we'll compare local + file X.orig (if extant, else X) against server file X. If -K + _wasn't_ specified last time, or the server contains files called + *.orig, -N will be back to not operating correctly with -k. */ + { + /* Would a single s[n]printf() call be faster? --dan + + Definitely not. sprintf() is horribly slow. It's a + different question whether the difference between the two + affects a program. Usually I'd say "no", but at one + point I profiled Wget, and found that a measurable and + non-negligible amount of time was lost calling sprintf() + in url.c. Replacing sprintf with inline calls to + strcpy() and number_to_string() made a difference. + --hniksic */ + memcpy (filename_plus_orig_suffix, hs->local_file, filename_len); + memcpy (filename_plus_orig_suffix + filename_len, + ORIG_SFX, sizeof (ORIG_SFX)); + + /* Try to stat() the .orig file. */ + if (stat (filename_plus_orig_suffix, &st) == 0) + { + local_dot_orig_file_exists = true; + local_filename = filename_plus_orig_suffix; + } + } + + if (!local_dot_orig_file_exists) + /* Couldn't stat() .orig, so try to stat() . */ + if (stat (hs->local_file, &st) == 0) + local_filename = hs->local_file; + + if (local_filename != NULL) + /* There was a local file, so we'll check later to see if the version + the server has is the same version we already have, allowing us to + skip a download. */ + { + hs->orig_file_name = xstrdup (local_filename); + hs->orig_file_size = st.st_size; + hs->orig_file_tstamp = st.st_mtime; +#ifdef WINDOWS + /* Modification time granularity is 2 seconds for Windows, so + increase local time by 1 second for later comparison. */ + ++hs->orig_file_tstamp; +#endif + hs->timestamp_checked = true; + } + + return RETROK; +} + static uerr_t check_file_output (struct url *u, struct http_stat *hs, struct response *resp, char *hdrval, size_t hdrsize) @@ -2077,61 +2206,9 @@ check_file_output (struct url *u, struct http_stat *hs, /* Support timestamping */ if (opt.timestamping && !hs->timestamp_checked) { - size_t filename_len = strlen (hs->local_file); - char *filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX)); - bool local_dot_orig_file_exists = false; - char *local_filename = NULL; - struct_stat st; - - if (opt.backup_converted) - /* If -K is specified, we'll act on the assumption that it was specified - last time these files were downloaded as well, and instead of just - comparing local file X against server file X, we'll compare local - file X.orig (if extant, else X) against server file X. If -K - _wasn't_ specified last time, or the server contains files called - *.orig, -N will be back to not operating correctly with -k. */ - { - /* Would a single s[n]printf() call be faster? --dan - - Definitely not. sprintf() is horribly slow. It's a - different question whether the difference between the two - affects a program. Usually I'd say "no", but at one - point I profiled Wget, and found that a measurable and - non-negligible amount of time was lost calling sprintf() - in url.c. Replacing sprintf with inline calls to - strcpy() and number_to_string() made a difference. - --hniksic */ - memcpy (filename_plus_orig_suffix, hs->local_file, filename_len); - memcpy (filename_plus_orig_suffix + filename_len, - ORIG_SFX, sizeof (ORIG_SFX)); - - /* Try to stat() the .orig file. */ - if (stat (filename_plus_orig_suffix, &st) == 0) - { - local_dot_orig_file_exists = true; - local_filename = filename_plus_orig_suffix; - } - } - - if (!local_dot_orig_file_exists) - /* Couldn't stat() .orig, so try to stat() . */ - if (stat (hs->local_file, &st) == 0) - local_filename = hs->local_file; - - if (local_filename != NULL) - /* There was a local file, so we'll check later to see if the version - the server has is the same version we already have, allowing us to - skip a download. */ - { - hs->orig_file_name = xstrdup (local_filename); - hs->orig_file_size = st.st_size; - hs->orig_file_tstamp = st.st_mtime; -#ifdef WINDOWS - /* Modification time granularity is 2 seconds for Windows, so - increase local time by 1 second for later comparison. */ - ++hs->orig_file_tstamp; -#endif - } + uerr_t timestamp_err = set_file_timestamp (hs); + if (timestamp_err != RETROK) + return timestamp_err; } return RETROK; } @@ -2421,6 +2498,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, POST). */ bool head_only = !!(*dt & HEAD_ONLY); + /* Whether conditional get request will be issued. */ + bool cond_get = !!(*dt & IF_MODIFIED_SINCE); + char *head = NULL; struct response *resp = NULL; char hdrval[512]; @@ -3020,6 +3100,41 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, } } + if (cond_get) + { + if (statcode == HTTP_STATUS_NOT_MODIFIED) + { + logprintf (LOG_VERBOSE, + _("File %s not modified on server. Omitting download.\n\n"), + quote (hs->local_file)); + *dt |= RETROKF; + CLOSE_FINISH (sock); + retval = RETRUNNEEDED; + goto cleanup; + } + /* Handle the case when server ignores If-Modified-Since header. */ + else if (statcode == HTTP_STATUS_OK && hs->remote_time) + { + time_t tmr = http_atotm (hs->remote_time); + + /* Check if the local file is up-to-date based on Last-Modified header + and content length. */ + if (tmr != (time_t) - 1 && tmr <= hs->orig_file_tstamp + && (contlen == -1 || contlen == hs->orig_file_size)) + { + logprintf (LOG_VERBOSE, + _("Server ignored If-Modified-Since header for file %s.\n" + "You might want to add --no-if-modified-since option." + "\n\n"), + quote (hs->local_file)); + *dt |= RETROKF; + CLOSE_INVALIDATE (sock); + retval = RETRUNNEEDED; + goto cleanup; + } + } + } + if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE || (!opt.timestamping && hs->restval > 0 && statcode == HTTP_STATUS_OK && contrange == 0 && contlen >= 0 && hs->restval >= contlen)) @@ -3263,15 +3378,30 @@ http_loop (struct url *u, struct url *original_url, char **newloc, if (opt.content_disposition && opt.always_rest) send_head_first = true; - /* Send preliminary HEAD request if -N is given and we have an existing - * destination file. */ if (!opt.output_document) file_name = url_file_name (opt.trustservernames ? u : original_url, NULL); else file_name = xstrdup (opt.output_document); - if (opt.timestamping && (file_exists_p (file_name) - || opt.content_disposition)) - send_head_first = true; + + if (opt.timestamping) + { + /* Use conditional get request if requested + * and if timestamp is known at this moment. */ + if (opt.if_modified_since && file_exists_p (file_name) && !send_head_first) + { + *dt |= IF_MODIFIED_SINCE; + { + uerr_t timestamp_err = set_file_timestamp (&hstat); + if (timestamp_err != RETROK) + return timestamp_err; + } + } + /* Send preliminary HEAD request if -N is given and we have existing + * destination file or content disposition is enabled. */ + else if (file_exists_p (file_name) || opt.content_disposition) + send_head_first = true; + } + xfree (file_name); /* THE loop */ diff --git a/src/wget.h b/src/wget.h index 8d2b0f1a..2c317131 100644 --- a/src/wget.h +++ b/src/wget.h @@ -331,7 +331,8 @@ enum SEND_NOCACHE = 0x0008, /* send Pragma: no-cache directive */ ACCEPTRANGES = 0x0010, /* Accept-ranges header was found */ ADDED_HTML_EXTENSION = 0x0020, /* added ".html" extension due to -E */ - TEXTCSS = 0x0040 /* document is of type text/css */ + TEXTCSS = 0x0040, /* document is of type text/css */ + IF_MODIFIED_SINCE = 0x0080 /* use if-modified-since header */ }; /* Universal error type -- used almost everywhere. Error reporting of @@ -351,7 +352,8 @@ typedef enum RETRBADPATTERN, PROXERR, AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED, VERIFCERTERR, UNLINKERR, NEWLOCATION_KEEP_POST, CLOSEFAILED, ATTRMISSING, UNKNOWNATTR, - WARC_ERR, WARC_TMP_FOPENERR, WARC_TMP_FWRITEERR + WARC_ERR, WARC_TMP_FOPENERR, WARC_TMP_FWRITEERR, + TIMECONV_ERR } uerr_t; /* 2005-02-19 SMS.