From 4ad201a7e70d973846a39c09c137dd02dd0d3d48 Mon Sep 17 00:00:00 2001 From: Ander Juaristi Date: Tue, 22 Sep 2015 21:10:38 +0200 Subject: [PATCH] Added --convert-file-only option * src/convert.c (convert_links_in_hashtable, convert_links): test for CO_CONVERT_BASENAME_ONLY. (convert_basename): new function. * src/convert.h: new constant CO_CONVERT_BASENAME_ONLY. * src/init.c, src/main.c, src/options.h: new option "--convert-file-only". * doc/wget.texi: updated documentation. Reviewed-by: Gabriel Somlo --- doc/wget.texi | 17 +++++++++ src/convert.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++-- src/convert.h | 2 ++ src/init.c | 2 ++ src/main.c | 24 +++++++++---- src/options.h | 3 ++ 6 files changed, 136 insertions(+), 8 deletions(-) diff --git a/doc/wget.texi b/doc/wget.texi index f1244aa0..d3e42786 100644 --- a/doc/wget.texi +++ b/doc/wget.texi @@ -2123,6 +2123,23 @@ Note that only at the end of the download can Wget know which links have been downloaded. Because of that, the work done by @samp{-k} will be performed at the end of all the downloads. +@item --convert-file-only +This option converts only the filename part of the URLs, leaving the rest +of the URLs untouched. This filename part is sometimes referred to as the +"basename", although we avoid that term here in order not to cause confusion. + +It works particularly well in conjunction with @samp{--adjust-extension}, although +this coupling is not enforced. It proves useful to populate Internet caches +with files downloaded from different hosts. + +Example: if some link points to @file{//foo.com/bar.cgi?xyz} with +@samp{--adjust-extension} asserted and its local destination is intended to be +@file{./foo.com/bar.cgi?xyz.css}, then the link would be converted to +@file{//foo.com/bar.cgi?xyz.css}. Note that only the filename part has been +modified. The rest of the URL has been left untouched, including the net path +(@code{//}) which would otherwise be processed by Wget and converted to the +effective scheme (ie. @code{http://}). + @cindex backing up converted files @item -K @itemx --backup-converted diff --git a/src/convert.c b/src/convert.c index f0df9a05..8e9aa60b 100644 --- a/src/convert.c +++ b/src/convert.c @@ -46,6 +46,7 @@ as that of the covered work. */ #include "html-url.h" #include "css-url.h" #include "iri.h" +#include "xstrndup.h" static struct hash_table *dl_file_url_map; struct hash_table *dl_url_file_map; @@ -136,8 +137,9 @@ convert_links_in_hashtable (struct hash_table *downloaded_set, form. We do this even if the URL already is in relative form, because our directory structure may not be identical to that on the server (think `-nd', - `--cut-dirs', etc.) */ - cur_url->convert = CO_CONVERT_TO_RELATIVE; + `--cut-dirs', etc.). If --convert-file-only was passed, + we only convert the basename portion of the URL. */ + cur_url->convert = (opt.convert_file_only ? CO_CONVERT_BASENAME_ONLY : CO_CONVERT_TO_RELATIVE); cur_url->local_name = xstrdup (local_name); DEBUGP (("will convert url %s to local %s\n", u->url, local_name)); } @@ -206,6 +208,7 @@ static const char *replace_attr_refresh_hack (const char *, int, FILE *, const char *, int); static char *local_quote_string (const char *, bool); static char *construct_relative (const char *, const char *); +static char *convert_basename (const char *, const struct urlpos *); /* Change the links in one file. LINKS is a list of links in the document, along with their positions and the desired direction of @@ -315,9 +318,32 @@ convert_links (const char *file, struct urlpos *links) DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n", link->url->url, newname, link->pos, file)); + xfree (newname); xfree (quoted_newname); ++to_file_count; + break; + } + case CO_CONVERT_BASENAME_ONLY: + { + char *newname = convert_basename (p, link); + char *quoted_newname = local_quote_string (newname, link->link_css_p); + + if (link->link_css_p) + p = replace_plain (p, link->size, fp, quoted_newname); + else if (!link->link_refresh_p) + p = replace_attr (p, link->size, fp, quoted_newname); + else + p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname, + link->refresh_timeout); + + DEBUGP (("Converted file part only: %s to %s at position %d in %s.\n", + link->url->url, newname, link->pos, file)); + + xfree (newname); + xfree (quoted_newname); + ++to_file_count; + break; } case CO_CONVERT_TO_COMPLETE: @@ -336,6 +362,7 @@ convert_links (const char *file, struct urlpos *links) DEBUGP (("TO_COMPLETE: to %s at position %d in %s.\n", newlink, link->pos, file)); + xfree (quoted_newlink); ++to_url_count; break; @@ -422,6 +449,71 @@ construct_relative (const char *basefile, const char *linkfile) return link; } +/* Construct and return a "transparent proxy" URL + reflecting changes made by --adjust-extension to the file component + (i.e., "basename") of the original URL, but leaving the "dirname" + of the URL (protocol://hostname... portion) untouched. + + Think: populating a squid cache via a recursive wget scrape, where + changing URLs to work locally with "file://..." is NOT desirable. + + Example: + + if + p = "//foo.com/bar.cgi?xyz" + and + link->local_name = "docroot/foo.com/bar.cgi?xyz.css" + then + + new_construct_func(p, link); + will return + "//foo.com/bar.cgi?xyz.css" + + Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/ +*/ +static char * +convert_basename (const char *p, const struct urlpos *link) +{ + int len = link->size; + char *url = NULL; + char *org_basename = NULL, *local_basename = NULL; + char *result = NULL; + + if (*p == '"' || *p == '\'') + { + len -= 2; + p++; + } + + url = xstrndup (p, len); + + org_basename = strrchr (url, '/'); + if (org_basename) + org_basename++; + else + org_basename = url; + + local_basename = strrchr (link->local_name, '/'); + if (local_basename) + local_basename++; + else + local_basename = url; + + /* + * If the basenames differ, graft the adjusted basename (local_basename) + * onto the original URL. + */ + if (strcmp (org_basename, local_basename) == 0) + result = url; + else + { + result = uri_merge (url, local_basename); + xfree (url); + } + + return result; +} + /* Used by write_backup_file to remember which files have been written. */ static struct hash_table *converted_files; diff --git a/src/convert.h b/src/convert.h index 3105db18..b3cd196e 100644 --- a/src/convert.h +++ b/src/convert.h @@ -40,6 +40,8 @@ enum convert_options { CO_NOCONVERT = 0, /* don't convert this URL */ CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to "../../otherdir/foo.gif" */ + CO_CONVERT_BASENAME_ONLY, /* convert the file portion only (basename) + leaving the rest of the URL unchanged */ CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to "http://orighost/somedir/bar.jpg". */ CO_NULLIFY_BASE /* change to empty string. */ diff --git a/src/init.c b/src/init.c index dd1506ce..67c94b94 100644 --- a/src/init.c +++ b/src/init.c @@ -159,6 +159,7 @@ static const struct { { "contentdisposition", &opt.content_disposition, cmd_boolean }, { "contentonerror", &opt.content_on_error, cmd_boolean }, { "continue", &opt.always_rest, cmd_boolean }, + { "convertfileonly", &opt.convert_file_only, cmd_boolean }, { "convertlinks", &opt.convert_links, cmd_boolean }, { "cookies", &opt.cookies, cmd_boolean }, #ifdef HAVE_SSL @@ -377,6 +378,7 @@ defaults (void) opt.htmlify = true; opt.http_keep_alive = true; opt.use_proxy = true; + opt.convert_file_only = false; tmp = getenv ("no_proxy"); if (tmp) opt.no_proxy = sepstring (tmp); diff --git a/src/main.c b/src/main.c index 92d1bee2..cfdc361a 100644 --- a/src/main.c +++ b/src/main.c @@ -264,6 +264,7 @@ static struct cmdline_option option_data[] = { "config", 0, OPT_VALUE, "chooseconfig", -1 }, { "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 }, { "continue", 'c', OPT_BOOLEAN, "continue", -1 }, + { "convert-file-only", 0, OPT_BOOLEAN, "convertfileonly", -1 }, { "convert-links", 'k', OPT_BOOLEAN, "convertlinks", -1 }, { "content-disposition", 0, OPT_BOOLEAN, "contentdisposition", -1 }, { "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 }, @@ -876,6 +877,8 @@ Recursive download:\n"), N_("\ -k, --convert-links make links in downloaded HTML or CSS point to\n\ local files\n"), + N_("\ + --convert-file-only convert the file part of the URLs only (usually known as the basename)\n"), N_("\ --backups=N before writing file X, rotate up to N backup files\n"), @@ -1387,11 +1390,14 @@ main (int argc, char **argv) /* All user options have now been processed, so it's now safe to do interoption dependency checks. */ - if (opt.noclobber && opt.convert_links) + if (opt.noclobber && (opt.convert_links || opt.convert_file_only)) { fprintf (stderr, - _("Both --no-clobber and --convert-links were specified," - " only --convert-links will be used.\n")); + opt.convert_links ? + _("Both --no-clobber and --convert-links were specified," + " only --convert-links will be used.\n") : + _("Both --no-clobber and --convert-file-only were specified," + " only --convert-file-only will be used.\n")); opt.noclobber = false; } @@ -1445,11 +1451,11 @@ Can't timestamp and not clobber old files at the same time.\n")); #endif if (opt.output_document) { - if (opt.convert_links + if ((opt.convert_links || opt.convert_file_only) && (nurl > 1 || opt.page_requisites || opt.recursive)) { fputs (_("\ -Cannot specify both -k and -O if multiple URLs are given, or in combination\n\ +Cannot specify both -k or --convert-file-only and -O if multiple URLs are given, or in combination\n\ with -p or -r. See the manual for details.\n\n"), stderr); print_usage (1); exit (WGET_EXIT_GENERIC_ERROR); @@ -1760,6 +1766,12 @@ for details.\n\n")); outputting to a regular file.\n")); exit (WGET_EXIT_GENERIC_ERROR); } + if (!output_stream_regular && (opt.convert_links || opt.convert_file_only)) + { + fprintf (stderr, _("--convert-links or --convert-file-only can be used together \ +only if outputting to a regular file.\n")); + exit (WGET_EXIT_GENERIC_ERROR); + } } #ifdef __VMS @@ -1970,7 +1982,7 @@ outputting to a regular file.\n")); save_hsts (); #endif - if (opt.convert_links && !opt.delete_after) + if ((opt.convert_links || opt.convert_file_only) && !opt.delete_after) convert_all_links (); cleanup (); diff --git a/src/options.h b/src/options.h index 0bb7d715..dad08c1c 100644 --- a/src/options.h +++ b/src/options.h @@ -182,6 +182,9 @@ struct options NULL. */ bool convert_links; /* Will the links be converted locally? */ + bool convert_file_only; /* Convert only the file portion of the URI (i.e. basename). + Leave everything else untouched. */ + bool remove_listing; /* Do we remove .listing files generated by FTP? */ bool htmlify; /* Do we HTML-ify the OS-dependent