mirror of
https://github.com/mirror/wget.git
synced 2025-03-13 11:20:19 +08:00
Support continuous reading from stdin pipes
Rather than reading from stdin only once, leave the pipe open until the other end closes it, and keep reading from the file after each set of URLs is read. * src/html-url.h(get_urls_file): Update prototype to add an additional parameter. * src/html-url.c(get_urls_file): Pass through read_again to wget_read_from_file. * src/retr.c(retrieve_from_file): Split the function into two. Introduce `retrieve_from_url_list` that actually performs the retrieval. Also, if reading the URL list reports that the fd has been left open, then continue reading from it until the fd is closed. (retrieve_from_url_list): New function that does the retrieval from a list of URLs that was read from a file. * src/utils.c(wget_read_from_file): Rename old function `wget_read_file` to this. Accept an additional output parameter that states whether the fd was left open and whether we should continue reading from it after the current set of URLs has been processed. (wget_read_file): Write it as a new wrapper function around `wget_read_from_file` to maintain API compatibility across other users.
This commit is contained in:
parent
ca10f20aaf
commit
196ce0abd6
@ -874,20 +874,21 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
|
||||
to get_urls_html, so we put it here. */
|
||||
|
||||
struct urlpos *
|
||||
get_urls_file (const char *file)
|
||||
get_urls_file (const char *file, bool *read_again)
|
||||
{
|
||||
struct file_memory *fm;
|
||||
struct urlpos *head, *tail;
|
||||
const char *text, *text_end;
|
||||
|
||||
/* Load the file. */
|
||||
fm = wget_read_file (file);
|
||||
fm = wget_read_from_file (file, read_again);
|
||||
if (!fm)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
||||
return NULL;
|
||||
}
|
||||
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
||||
if (fm->length)
|
||||
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
||||
|
||||
head = tail = NULL;
|
||||
text = fm->content;
|
||||
|
@ -48,7 +48,7 @@ struct map_context {
|
||||
struct urlpos *head; /* List of URLs that is being built. */
|
||||
};
|
||||
|
||||
struct urlpos *get_urls_file (const char *);
|
||||
struct urlpos *get_urls_file (const char *, bool *);
|
||||
struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
|
||||
struct urlpos *get_urls_html_fm (const char *, const struct file_memory *, const char *, bool *, struct iri *);
|
||||
struct urlpos *append_url (const char *, int, int, struct map_context *);
|
||||
|
142
src/retr.c
142
src/retr.c
@ -1177,73 +1177,10 @@ bail:
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Find the URLs in the file and call retrieve_url() for each of them.
|
||||
If HTML is true, treat the file as HTML, and construct the URLs
|
||||
accordingly.
|
||||
|
||||
If opt.recursive is set, call retrieve_tree() for each file. */
|
||||
|
||||
uerr_t
|
||||
retrieve_from_file (const char *file, bool html, int *count)
|
||||
static uerr_t retrieve_from_url_list(struct urlpos *url_list, int *count, struct iri *iri)
|
||||
{
|
||||
struct urlpos *cur_url;
|
||||
uerr_t status;
|
||||
struct urlpos *url_list, *cur_url;
|
||||
struct iri *iri = iri_new();
|
||||
|
||||
char *input_file, *url_file = NULL;
|
||||
const char *url = file;
|
||||
|
||||
status = RETROK; /* Suppose everything is OK. */
|
||||
*count = 0; /* Reset the URL count. */
|
||||
|
||||
/* sXXXav : Assume filename and links in the file are in the locale */
|
||||
set_uri_encoding (iri, opt.locale, true);
|
||||
set_content_encoding (iri, opt.locale);
|
||||
|
||||
if (url_valid_scheme (url))
|
||||
{
|
||||
int dt,url_err;
|
||||
struct url *url_parsed = url_parse (url, &url_err, iri, true);
|
||||
if (!url_parsed)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err));
|
||||
iri_free (iri);
|
||||
return URLERROR;
|
||||
}
|
||||
|
||||
if (!opt.base_href)
|
||||
opt.base_href = xstrdup (url);
|
||||
|
||||
status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
|
||||
false, iri, true);
|
||||
url_free (url_parsed);
|
||||
|
||||
if (!url_file || (status != RETROK))
|
||||
return status;
|
||||
|
||||
if (dt & TEXTHTML)
|
||||
html = true;
|
||||
|
||||
#ifdef ENABLE_IRI
|
||||
/* If we have a found a content encoding, use it.
|
||||
* ( == is okay, because we're checking for identical object) */
|
||||
if (iri->content_encoding != opt.locale)
|
||||
set_uri_encoding (iri, iri->content_encoding, false);
|
||||
#endif
|
||||
|
||||
/* Reset UTF-8 encode status */
|
||||
iri->utf8_encode = opt.enable_iri;
|
||||
xfree (iri->orig_url);
|
||||
|
||||
input_file = url_file;
|
||||
}
|
||||
else
|
||||
input_file = (char *) file;
|
||||
|
||||
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
|
||||
: get_urls_file (input_file));
|
||||
|
||||
xfree (url_file);
|
||||
|
||||
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
|
||||
{
|
||||
@ -1310,6 +1247,81 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
|
||||
xfree (filename);
|
||||
iri_free (tmpiri);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
/* Find the URLs in the file and call retrieve_url() for each of them.
|
||||
If HTML is true, treat the file as HTML, and construct the URLs
|
||||
accordingly.
|
||||
|
||||
If opt.recursive is set, call retrieve_tree() for each file. */
|
||||
|
||||
uerr_t
|
||||
retrieve_from_file (const char *file, bool html, int *count)
|
||||
{
|
||||
uerr_t status;
|
||||
struct urlpos *url_list, *cur_url;
|
||||
struct iri *iri = iri_new();
|
||||
|
||||
char *input_file, *url_file = NULL;
|
||||
const char *url = file;
|
||||
|
||||
status = RETROK; /* Suppose everything is OK. */
|
||||
*count = 0; /* Reset the URL count. */
|
||||
|
||||
/* sXXXav : Assume filename and links in the file are in the locale */
|
||||
set_uri_encoding (iri, opt.locale, true);
|
||||
set_content_encoding (iri, opt.locale);
|
||||
|
||||
if (url_valid_scheme (url))
|
||||
{
|
||||
int dt,url_err;
|
||||
struct url *url_parsed = url_parse (url, &url_err, iri, true);
|
||||
if (!url_parsed)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err));
|
||||
iri_free (iri);
|
||||
return URLERROR;
|
||||
}
|
||||
|
||||
if (!opt.base_href)
|
||||
opt.base_href = xstrdup (url);
|
||||
|
||||
status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
|
||||
false, iri, true);
|
||||
url_free (url_parsed);
|
||||
|
||||
if (!url_file || (status != RETROK))
|
||||
return status;
|
||||
|
||||
if (dt & TEXTHTML)
|
||||
html = true;
|
||||
|
||||
#ifdef ENABLE_IRI
|
||||
/* If we have a found a content encoding, use it.
|
||||
* ( == is okay, because we're checking for identical object) */
|
||||
if (iri->content_encoding != opt.locale)
|
||||
set_uri_encoding (iri, iri->content_encoding, false);
|
||||
#endif
|
||||
|
||||
/* Reset UTF-8 encode status */
|
||||
iri->utf8_encode = opt.enable_iri;
|
||||
xfree (iri->orig_url);
|
||||
|
||||
input_file = url_file;
|
||||
}
|
||||
else
|
||||
input_file = (char *) file;
|
||||
|
||||
bool read_again = false;
|
||||
do {
|
||||
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
|
||||
: get_urls_file (input_file, &read_again));
|
||||
|
||||
status = retrieve_from_url_list(url_list, count, iri);
|
||||
} while (read_again);
|
||||
|
||||
xfree (url_file);
|
||||
|
||||
/* Free the linked list of URL-s. */
|
||||
free_urlpos (url_list);
|
||||
|
32
src/utils.c
32
src/utils.c
@ -1266,6 +1266,13 @@ has_html_suffix_p (const char *fname)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Backward-compatible wrapper around wget_read_from_file().
   Reads FILE into memory exactly as the old wget_read_file() did,
   discarding the "fd was left open" flag that the newer interface
   reports, so existing callers keep the original one-shot semantics.
   Returns the file contents, or NULL on failure (see
   wget_read_from_file for details).  */
struct file_memory *
wget_read_file (const char *file)
{
  bool discarded_left_open;

  return wget_read_from_file (file, &discarded_left_open);
}
|
||||
|
||||
/* Read FILE into memory. A pointer to `struct file_memory' are
|
||||
returned; use struct element `content' to access file contents, and
|
||||
the element `length' to know the file length. `content' is *not*
|
||||
@ -1283,7 +1290,7 @@ has_html_suffix_p (const char *fname)
|
||||
If you want to read from a real file named "-", use "./-" instead. */
|
||||
|
||||
struct file_memory *
|
||||
wget_read_file (const char *file)
|
||||
wget_read_from_file (const char *file, bool *left_open)
|
||||
{
|
||||
int fd;
|
||||
struct file_memory *fm;
|
||||
@ -1296,6 +1303,8 @@ wget_read_file (const char *file)
|
||||
if (HYPHENP (file))
|
||||
{
|
||||
fd = fileno (stdin);
|
||||
int flags = fcntl(fd, F_GETFL, 0);
|
||||
fcntl(fd, F_SETFL, flags | O_NONBLOCK);
|
||||
inhibit_close = true;
|
||||
/* Note that we don't inhibit mmap() in this case. If stdin is
|
||||
redirected from a regular file, mmap() will still work. */
|
||||
@ -1366,11 +1375,24 @@ wget_read_file (const char *file)
|
||||
/* Successful read. */
|
||||
fm->length += nread;
|
||||
else if (nread < 0)
|
||||
/* Error. */
|
||||
goto lose;
|
||||
{
|
||||
if (errno == EAGAIN)
|
||||
{
|
||||
*left_open = true;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Error. */
|
||||
goto lose;
|
||||
}
|
||||
}
|
||||
else
|
||||
/* EOF */
|
||||
break;
|
||||
{
|
||||
/* EOF */
|
||||
*left_open = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!inhibit_close)
|
||||
close (fd);
|
||||
|
@ -106,6 +106,7 @@ bool has_wildcards_p (const char *);
|
||||
|
||||
bool has_html_suffix_p (const char *);
|
||||
|
||||
struct file_memory *wget_read_from_file (const char *, bool *);
|
||||
struct file_memory *wget_read_file (const char *);
|
||||
void wget_read_file_free (struct file_memory *);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user