Added --convert-file-only option

* src/convert.c (convert_links_in_hashtable, convert_links):
   test for CO_CONVERT_BASENAME_ONLY.
   (convert_basename): new function.
 * src/convert.h: new constant CO_CONVERT_BASENAME_ONLY.
 * src/init.c, src/main.c, src/options.h: new option "--convert-file-only".
 * doc/wget.texi: updated documentation.

 Reviewed-by: Gabriel Somlo <somlo@cmu.edu>
This commit is contained in:
Ander Juaristi 2015-09-22 21:10:38 +02:00 committed by Tim Rühsen
parent 3608c3001c
commit 4ad201a7e7
6 changed files with 136 additions and 8 deletions

View File

@ -2123,6 +2123,23 @@ Note that only at the end of the download can Wget know which links have
been downloaded. Because of that, the work done by @samp{-k} will be
performed at the end of all the downloads.
@item --convert-file-only
This option converts only the filename part of the URLs, leaving the rest
of the URLs untouched. This filename part is sometimes referred to as the
"basename", although we avoid that term here in order not to cause confusion.
It works particularly well in conjunction with @samp{--adjust-extension}, although
this coupling is not enforced. It proves useful to populate Internet caches
with files downloaded from different hosts.
Example: if some link points to @file{//foo.com/bar.cgi?xyz} with
@samp{--adjust-extension} asserted and its local destination is intended to be
@file{./foo.com/bar.cgi?xyz.css}, then the link would be converted to
@file{//foo.com/bar.cgi?xyz.css}. Note that only the filename part has been
modified. The rest of the URL has been left untouched, including the net path
(@code{//}) which would otherwise be processed by Wget and converted to the
effective scheme (ie. @code{http://}).
@cindex backing up converted files
@item -K
@itemx --backup-converted

View File

@ -46,6 +46,7 @@ as that of the covered work. */
#include "html-url.h"
#include "css-url.h"
#include "iri.h"
#include "xstrndup.h"
static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;
@ -136,8 +137,9 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
form. We do this even if the URL already is in
relative form, because our directory structure may
not be identical to that on the server (think `-nd',
`--cut-dirs', etc.) */
cur_url->convert = CO_CONVERT_TO_RELATIVE;
`--cut-dirs', etc.). If --convert-file-only was passed,
we only convert the basename portion of the URL. */
cur_url->convert = (opt.convert_file_only ? CO_CONVERT_BASENAME_ONLY : CO_CONVERT_TO_RELATIVE);
cur_url->local_name = xstrdup (local_name);
DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
}
@ -206,6 +208,7 @@ static const char *replace_attr_refresh_hack (const char *, int, FILE *,
const char *, int);
static char *local_quote_string (const char *, bool);
static char *construct_relative (const char *, const char *);
static char *convert_basename (const char *, const struct urlpos *);
/* Change the links in one file. LINKS is a list of links in the
document, along with their positions and the desired direction of
@ -315,9 +318,32 @@ convert_links (const char *file, struct urlpos *links)
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
link->url->url, newname, link->pos, file));
xfree (newname);
xfree (quoted_newname);
++to_file_count;
break;
}
case CO_CONVERT_BASENAME_ONLY:
{
char *newname = convert_basename (p, link);
char *quoted_newname = local_quote_string (newname, link->link_css_p);
if (link->link_css_p)
p = replace_plain (p, link->size, fp, quoted_newname);
else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newname);
else
p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
link->refresh_timeout);
DEBUGP (("Converted file part only: %s to %s at position %d in %s.\n",
link->url->url, newname, link->pos, file));
xfree (newname);
xfree (quoted_newname);
++to_file_count;
break;
}
case CO_CONVERT_TO_COMPLETE:
@ -336,6 +362,7 @@ convert_links (const char *file, struct urlpos *links)
DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
newlink, link->pos, file));
xfree (quoted_newlink);
++to_url_count;
break;
@ -422,6 +449,71 @@ construct_relative (const char *basefile, const char *linkfile)
return link;
}
/* Construct and return a "transparent proxy" URL
reflecting changes made by --adjust-extension to the file component
(i.e., "basename") of the original URL, but leaving the "dirname"
of the URL (protocol://hostname... portion) untouched.
Think: populating a squid cache via a recursive wget scrape, where
changing URLs to work locally with "file://..." is NOT desirable.
Example:
if
p = "//foo.com/bar.cgi?xyz"
and
link->local_name = "docroot/foo.com/bar.cgi?xyz.css"
then
new_construct_func(p, link);
will return
"//foo.com/bar.cgi?xyz.css"
Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/
*/
static char *
convert_basename (const char *p, const struct urlpos *link)
{
int len = link->size;
char *url = NULL;
char *org_basename = NULL, *local_basename = NULL;
char *result = NULL;
if (*p == '"' || *p == '\'')
{
len -= 2;
p++;
}
url = xstrndup (p, len);
org_basename = strrchr (url, '/');
if (org_basename)
org_basename++;
else
org_basename = url;
local_basename = strrchr (link->local_name, '/');
if (local_basename)
local_basename++;
else
local_basename = url;
/*
* If the basenames differ, graft the adjusted basename (local_basename)
* onto the original URL.
*/
if (strcmp (org_basename, local_basename) == 0)
result = url;
else
{
result = uri_merge (url, local_basename);
xfree (url);
}
return result;
}
/* Used by write_backup_file to remember which files have been
written. */
static struct hash_table *converted_files;

View File

@ -40,6 +40,8 @@ enum convert_options {
CO_NOCONVERT = 0, /* don't convert this URL */
CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to
"../../otherdir/foo.gif" */
CO_CONVERT_BASENAME_ONLY, /* convert the file portion only (basename)
leaving the rest of the URL unchanged */
CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to
"http://orighost/somedir/bar.jpg". */
CO_NULLIFY_BASE /* change to empty string. */

View File

@ -159,6 +159,7 @@ static const struct {
{ "contentdisposition", &opt.content_disposition, cmd_boolean },
{ "contentonerror", &opt.content_on_error, cmd_boolean },
{ "continue", &opt.always_rest, cmd_boolean },
{ "convertfileonly", &opt.convert_file_only, cmd_boolean },
{ "convertlinks", &opt.convert_links, cmd_boolean },
{ "cookies", &opt.cookies, cmd_boolean },
#ifdef HAVE_SSL
@ -377,6 +378,7 @@ defaults (void)
opt.htmlify = true;
opt.http_keep_alive = true;
opt.use_proxy = true;
opt.convert_file_only = false;
tmp = getenv ("no_proxy");
if (tmp)
opt.no_proxy = sepstring (tmp);

View File

@ -264,6 +264,7 @@ static struct cmdline_option option_data[] =
{ "config", 0, OPT_VALUE, "chooseconfig", -1 },
{ "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
{ "continue", 'c', OPT_BOOLEAN, "continue", -1 },
{ "convert-file-only", 0, OPT_BOOLEAN, "convertfileonly", -1 },
{ "convert-links", 'k', OPT_BOOLEAN, "convertlinks", -1 },
{ "content-disposition", 0, OPT_BOOLEAN, "contentdisposition", -1 },
{ "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 },
@ -876,6 +877,8 @@ Recursive download:\n"),
N_("\
-k, --convert-links make links in downloaded HTML or CSS point to\n\
local files\n"),
N_("\
--convert-file-only convert the file part of the URLs only (usually known as the basename)\n"),
N_("\
--backups=N before writing file X, rotate up to N backup files\n"),
@ -1387,11 +1390,14 @@ main (int argc, char **argv)
/* All user options have now been processed, so it's now safe to do
interoption dependency checks. */
if (opt.noclobber && opt.convert_links)
if (opt.noclobber && (opt.convert_links || opt.convert_file_only))
{
fprintf (stderr,
_("Both --no-clobber and --convert-links were specified,"
" only --convert-links will be used.\n"));
opt.convert_links ?
_("Both --no-clobber and --convert-links were specified,"
" only --convert-links will be used.\n") :
_("Both --no-clobber and --convert-file-only were specified,"
" only --convert-file-only will be used.\n"));
opt.noclobber = false;
}
@ -1445,11 +1451,11 @@ Can't timestamp and not clobber old files at the same time.\n"));
#endif
if (opt.output_document)
{
if (opt.convert_links
if ((opt.convert_links || opt.convert_file_only)
&& (nurl > 1 || opt.page_requisites || opt.recursive))
{
fputs (_("\
Cannot specify both -k and -O if multiple URLs are given, or in combination\n\
Cannot specify both -k or --convert-file-only and -O if multiple URLs are given, or in combination\n\
with -p or -r. See the manual for details.\n\n"), stderr);
print_usage (1);
exit (WGET_EXIT_GENERIC_ERROR);
@ -1760,6 +1766,12 @@ for details.\n\n"));
outputting to a regular file.\n"));
exit (WGET_EXIT_GENERIC_ERROR);
}
if (!output_stream_regular && (opt.convert_links || opt.convert_file_only))
{
fprintf (stderr, _("--convert-links or --convert-file-only can be used together \
only if outputting to a regular file.\n"));
exit (WGET_EXIT_GENERIC_ERROR);
}
}
#ifdef __VMS
@ -1970,7 +1982,7 @@ outputting to a regular file.\n"));
save_hsts ();
#endif
if (opt.convert_links && !opt.delete_after)
if ((opt.convert_links || opt.convert_file_only) && !opt.delete_after)
convert_all_links ();
cleanup ();

View File

@ -182,6 +182,9 @@ struct options
NULL. */
bool convert_links; /* Will the links be converted
locally? */
bool convert_file_only; /* Convert only the file portion of the URI (i.e. basename).
Leave everything else untouched. */
bool remove_listing; /* Do we remove .listing files
generated by FTP? */
bool htmlify; /* Do we HTML-ify the OS-dependent