Support continuous reading from stdin pipes

Rather than reading from stdin only once, leave the pipe open until the
other end closes it and keep reading from the file after each set of
URLs is read

* src/html-url.h(get_urls_file): Update prototype to add additional
  param
* src/html-url.c(get_urls_file): Pass through read_again to
  wget_read_from_file.
* src/retr.c(retrieve_from_file): Split the function into two. Introduce
  `retrieve_from_url_list` that actually performs the retrieval.
  Also, if `url_list` returns that the fd has been left open, then
  continue reading from it until the fd is closed.
  (retrieve_from_url_list): New function that does the retrieval from
  a list of URLs that was read from a file.
* src/utils.c(wget_read_from_file): Rename old function `wget_read_file`
  to this.
  Accept an additional output parameter that states whether the fd was
  left open and whether we should continue reading from it after the
  current set of URLs has been processed
  (wget_read_file): Write it as a new wrapper function around
  `wget_read_from_file` to maintain API compatibility for other callers
This commit is contained in:
Darshit Shah 2024-05-12 17:48:21 +02:00
parent ca10f20aaf
commit 196ce0abd6
5 changed files with 110 additions and 74 deletions

View File

@ -874,20 +874,21 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
to get_urls_html, so we put it here. */ to get_urls_html, so we put it here. */
struct urlpos * struct urlpos *
get_urls_file (const char *file) get_urls_file (const char *file, bool *read_again)
{ {
struct file_memory *fm; struct file_memory *fm;
struct urlpos *head, *tail; struct urlpos *head, *tail;
const char *text, *text_end; const char *text, *text_end;
/* Load the file. */ /* Load the file. */
fm = wget_read_file (file); fm = wget_read_from_file (file, read_again);
if (!fm) if (!fm)
{ {
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
return NULL; return NULL;
} }
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); if (fm->length)
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
head = tail = NULL; head = tail = NULL;
text = fm->content; text = fm->content;

View File

@ -48,7 +48,7 @@ struct map_context {
struct urlpos *head; /* List of URLs that is being built. */ struct urlpos *head; /* List of URLs that is being built. */
}; };
struct urlpos *get_urls_file (const char *); struct urlpos *get_urls_file (const char *, bool *);
struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *); struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
struct urlpos *get_urls_html_fm (const char *, const struct file_memory *, const char *, bool *, struct iri *); struct urlpos *get_urls_html_fm (const char *, const struct file_memory *, const char *, bool *, struct iri *);
struct urlpos *append_url (const char *, int, int, struct map_context *); struct urlpos *append_url (const char *, int, int, struct map_context *);

View File

@ -1177,73 +1177,10 @@ bail:
return result; return result;
} }
/* Find the URLs in the file and call retrieve_url() for each of them. static uerr_t retrieve_from_url_list(struct urlpos *url_list, int *count, struct iri *iri)
If HTML is true, treat the file as HTML, and construct the URLs
accordingly.
If opt.recursive is set, call retrieve_tree() for each file. */
uerr_t
retrieve_from_file (const char *file, bool html, int *count)
{ {
struct urlpos *cur_url;
uerr_t status; uerr_t status;
struct urlpos *url_list, *cur_url;
struct iri *iri = iri_new();
char *input_file, *url_file = NULL;
const char *url = file;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
/* sXXXav : Assume filename and links in the file are in the locale */
set_uri_encoding (iri, opt.locale, true);
set_content_encoding (iri, opt.locale);
if (url_valid_scheme (url))
{
int dt,url_err;
struct url *url_parsed = url_parse (url, &url_err, iri, true);
if (!url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err));
iri_free (iri);
return URLERROR;
}
if (!opt.base_href)
opt.base_href = xstrdup (url);
status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
false, iri, true);
url_free (url_parsed);
if (!url_file || (status != RETROK))
return status;
if (dt & TEXTHTML)
html = true;
#ifdef ENABLE_IRI
/* If we have a found a content encoding, use it.
* ( == is okay, because we're checking for identical object) */
if (iri->content_encoding != opt.locale)
set_uri_encoding (iri, iri->content_encoding, false);
#endif
/* Reset UTF-8 encode status */
iri->utf8_encode = opt.enable_iri;
xfree (iri->orig_url);
input_file = url_file;
}
else
input_file = (char *) file;
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file));
xfree (url_file);
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
{ {
@ -1310,6 +1247,81 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
xfree (filename); xfree (filename);
iri_free (tmpiri); iri_free (tmpiri);
} }
return status;
}
/* Find the URLs in the file and call retrieve_url() for each of them.
If HTML is true, treat the file as HTML, and construct the URLs
accordingly.
If opt.recursive is set, call retrieve_tree() for each file. */
uerr_t
retrieve_from_file (const char *file, bool html, int *count)
{
uerr_t status;
struct urlpos *url_list, *cur_url;
struct iri *iri = iri_new();
char *input_file, *url_file = NULL;
const char *url = file;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
/* sXXXav : Assume filename and links in the file are in the locale */
set_uri_encoding (iri, opt.locale, true);
set_content_encoding (iri, opt.locale);
if (url_valid_scheme (url))
{
int dt,url_err;
struct url *url_parsed = url_parse (url, &url_err, iri, true);
if (!url_parsed)
{
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err));
iri_free (iri);
return URLERROR;
}
if (!opt.base_href)
opt.base_href = xstrdup (url);
status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
false, iri, true);
url_free (url_parsed);
if (!url_file || (status != RETROK))
return status;
if (dt & TEXTHTML)
html = true;
#ifdef ENABLE_IRI
/* If we have a found a content encoding, use it.
* ( == is okay, because we're checking for identical object) */
if (iri->content_encoding != opt.locale)
set_uri_encoding (iri, iri->content_encoding, false);
#endif
/* Reset UTF-8 encode status */
iri->utf8_encode = opt.enable_iri;
xfree (iri->orig_url);
input_file = url_file;
}
else
input_file = (char *) file;
bool read_again = false;
do {
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file, &read_again));
status = retrieve_from_url_list(url_list, count, iri);
} while (read_again);
xfree (url_file);
/* Free the linked list of URL-s. */ /* Free the linked list of URL-s. */
free_urlpos (url_list); free_urlpos (url_list);

View File

@ -1266,6 +1266,13 @@ has_html_suffix_p (const char *fname)
return false; return false;
} }
/* Backward-compatible wrapper around wget_read_from_file().
   Reads FILE into memory and returns the resulting `struct file_memory'
   (or NULL on failure), discarding the information about whether the
   underlying fd was left open -- callers of this legacy entry point do
   not support continued reads from a still-open pipe.  */
struct file_memory *
wget_read_file (const char *file)
{
  /* Initialize defensively: wget_read_from_file() is not guaranteed to
     store through the out-parameter on every path (e.g. early error
     exits), and the value is intentionally ignored here.  */
  bool left_open = false;
  return wget_read_from_file (file, &left_open);
}
/* Read FILE into memory. A pointer to `struct file_memory' are /* Read FILE into memory. A pointer to `struct file_memory' are
returned; use struct element `content' to access file contents, and returned; use struct element `content' to access file contents, and
the element `length' to know the file length. `content' is *not* the element `length' to know the file length. `content' is *not*
@ -1283,7 +1290,7 @@ has_html_suffix_p (const char *fname)
If you want to read from a real file named "-", use "./-" instead. */ If you want to read from a real file named "-", use "./-" instead. */
struct file_memory * struct file_memory *
wget_read_file (const char *file) wget_read_from_file (const char *file, bool *left_open)
{ {
int fd; int fd;
struct file_memory *fm; struct file_memory *fm;
@ -1296,6 +1303,8 @@ wget_read_file (const char *file)
if (HYPHENP (file)) if (HYPHENP (file))
{ {
fd = fileno (stdin); fd = fileno (stdin);
int flags = fcntl(fd, F_GETFL, 0);
fcntl(fd, F_SETFL, flags | O_NONBLOCK);
inhibit_close = true; inhibit_close = true;
/* Note that we don't inhibit mmap() in this case. If stdin is /* Note that we don't inhibit mmap() in this case. If stdin is
redirected from a regular file, mmap() will still work. */ redirected from a regular file, mmap() will still work. */
@ -1366,11 +1375,24 @@ wget_read_file (const char *file)
/* Successful read. */ /* Successful read. */
fm->length += nread; fm->length += nread;
else if (nread < 0) else if (nread < 0)
/* Error. */ {
goto lose; if (errno == EAGAIN)
{
*left_open = true;
break;
}
else
{
/* Error. */
goto lose;
}
}
else else
/* EOF */ {
break; /* EOF */
*left_open = false;
break;
}
} }
if (!inhibit_close) if (!inhibit_close)
close (fd); close (fd);

View File

@ -106,6 +106,7 @@ bool has_wildcards_p (const char *);
bool has_html_suffix_p (const char *); bool has_html_suffix_p (const char *);
struct file_memory *wget_read_from_file (const char *, bool *);
struct file_memory *wget_read_file (const char *); struct file_memory *wget_read_file (const char *);
void wget_read_file_free (struct file_memory *); void wget_read_file_free (struct file_memory *);