From 196ce0abd67b9c905c600a217a7d2dd667216081 Mon Sep 17 00:00:00 2001 From: Darshit Shah Date: Sun, 12 May 2024 17:48:21 +0200 Subject: [PATCH] Support continuous reading from stdin pipes Rather than reading from stdin only once, leave the pipe open until the other end closes it and keep reading from the file after each set of URLs is read * src/html-url.h(get_urls_file): Update prototype to add additional param * src/html-url.c(get_urls_file): Pass through read_again to wget_read_from_file. * src/retr.c(retrieve_from_file): Split the function into two. Introduce `retrieve_from_url_list` that actually performs the retrieval. Also, if `url_list` returns that the fd has been left open, then continue reading from it until the fd is closed. (retrieve_from_url_list): New function that does the retrieval from a list of URLs that was read from a file. * src/utils.c(wget_read_from_file): Rename old function `wget_read_file` to this. Accept an additional output parameter that states whether the fd was left open and if we should continue reading from it after the current set of URLs have been processed (wget_read_file): Write it as a new wrapper function around `wget_read_from_file` to maintain API compatibility across other users --- src/html-url.c | 7 +-- src/html-url.h | 2 +- src/retr.c | 142 +++++++++++++++++++++++++++---------------------- src/utils.c | 32 +++++++++-- src/utils.h | 1 + 5 files changed, 110 insertions(+), 74 deletions(-) diff --git a/src/html-url.c b/src/html-url.c index a4ba1150..8e960092 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -874,20 +874,21 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, to get_urls_html, so we put it here. */ struct urlpos * -get_urls_file (const char *file) +get_urls_file (const char *file, bool *read_again) { struct file_memory *fm; struct urlpos *head, *tail; const char *text, *text_end; /* Load the file. 
*/ - fm = wget_read_file (file); + fm = wget_read_from_file (file, read_again); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); + if (fm->length) + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); head = tail = NULL; text = fm->content; diff --git a/src/html-url.h b/src/html-url.h index d4158164..cf80e43d 100644 --- a/src/html-url.h +++ b/src/html-url.h @@ -48,7 +48,7 @@ struct map_context { struct urlpos *head; /* List of URLs that is being built. */ }; -struct urlpos *get_urls_file (const char *); +struct urlpos *get_urls_file (const char *, bool *); struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *); struct urlpos *get_urls_html_fm (const char *, const struct file_memory *, const char *, bool *, struct iri *); struct urlpos *append_url (const char *, int, int, struct map_context *); diff --git a/src/retr.c b/src/retr.c index 3eea0e06..fc314caa 100644 --- a/src/retr.c +++ b/src/retr.c @@ -1177,73 +1177,10 @@ bail: return result; } -/* Find the URLs in the file and call retrieve_url() for each of them. - If HTML is true, treat the file as HTML, and construct the URLs - accordingly. - - If opt.recursive is set, call retrieve_tree() for each file. */ - -uerr_t -retrieve_from_file (const char *file, bool html, int *count) +static uerr_t retrieve_from_url_list(struct urlpos *url_list, int *count, struct iri *iri) { + struct urlpos *cur_url; uerr_t status; - struct urlpos *url_list, *cur_url; - struct iri *iri = iri_new(); - - char *input_file, *url_file = NULL; - const char *url = file; - - status = RETROK; /* Suppose everything is OK. */ - *count = 0; /* Reset the URL count. 
*/ - - /* sXXXav : Assume filename and links in the file are in the locale */ - set_uri_encoding (iri, opt.locale, true); - set_content_encoding (iri, opt.locale); - - if (url_valid_scheme (url)) - { - int dt,url_err; - struct url *url_parsed = url_parse (url, &url_err, iri, true); - if (!url_parsed) - { - logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err)); - iri_free (iri); - return URLERROR; - } - - if (!opt.base_href) - opt.base_href = xstrdup (url); - - status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt, - false, iri, true); - url_free (url_parsed); - - if (!url_file || (status != RETROK)) - return status; - - if (dt & TEXTHTML) - html = true; - -#ifdef ENABLE_IRI - /* If we have a found a content encoding, use it. - * ( == is okay, because we're checking for identical object) */ - if (iri->content_encoding != opt.locale) - set_uri_encoding (iri, iri->content_encoding, false); -#endif - - /* Reset UTF-8 encode status */ - iri->utf8_encode = opt.enable_iri; - xfree (iri->orig_url); - - input_file = url_file; - } - else - input_file = (char *) file; - - url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) - : get_urls_file (input_file)); - - xfree (url_file); for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) { @@ -1310,6 +1247,81 @@ Removing file due to --delete-after in retrieve_from_file():\n")); xfree (filename); iri_free (tmpiri); } + return status; +} + +/* Find the URLs in the file and call retrieve_url() for each of them. + If HTML is true, treat the file as HTML, and construct the URLs + accordingly. + + If opt.recursive is set, call retrieve_tree() for each file. */ + +uerr_t +retrieve_from_file (const char *file, bool html, int *count) +{ + uerr_t status; + struct urlpos *url_list, *cur_url; + struct iri *iri = iri_new(); + + char *input_file, *url_file = NULL; + const char *url = file; + + status = RETROK; /* Suppose everything is OK. */ + *count = 0; /* Reset the URL count. 
*/ + + /* sXXXav : Assume filename and links in the file are in the locale */ + set_uri_encoding (iri, opt.locale, true); + set_content_encoding (iri, opt.locale); + + if (url_valid_scheme (url)) + { + int dt,url_err; + struct url *url_parsed = url_parse (url, &url_err, iri, true); + if (!url_parsed) + { + logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err)); + iri_free (iri); + return URLERROR; + } + + if (!opt.base_href) + opt.base_href = xstrdup (url); + + status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt, + false, iri, true); + url_free (url_parsed); + + if (!url_file || (status != RETROK)) + return status; + + if (dt & TEXTHTML) + html = true; + +#ifdef ENABLE_IRI + /* If we have a found a content encoding, use it. + * ( == is okay, because we're checking for identical object) */ + if (iri->content_encoding != opt.locale) + set_uri_encoding (iri, iri->content_encoding, false); +#endif + + /* Reset UTF-8 encode status */ + iri->utf8_encode = opt.enable_iri; + xfree (iri->orig_url); + + input_file = url_file; + } + else + input_file = (char *) file; + + bool read_again = false; + do { + url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) + : get_urls_file (input_file, &read_again)); + + status = retrieve_from_url_list(url_list, count, iri); + } while (read_again); + + xfree (url_file); /* Free the linked list of URL-s. */ free_urlpos (url_list); diff --git a/src/utils.c b/src/utils.c index 9caaf727..58de4a72 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1266,6 +1266,13 @@ has_html_suffix_p (const char *fname) return false; } +struct file_memory * +wget_read_file (const char *file) +{ + bool left_open; + return wget_read_from_file(file, &left_open); +} + /* Read FILE into memory. A pointer to `struct file_memory' are returned; use struct element `content' to access file contents, and the element `length' to know the file length. 
`content' is *not* @@ -1283,7 +1290,7 @@ has_html_suffix_p (const char *fname) If you want to read from a real file named "-", use "./-" instead. */ struct file_memory * -wget_read_file (const char *file) +wget_read_from_file (const char *file, bool *left_open) { int fd; struct file_memory *fm; @@ -1296,6 +1303,8 @@ wget_read_file (const char *file) if (HYPHENP (file)) { fd = fileno (stdin); + int flags = fcntl(fd, F_GETFL, 0); + fcntl(fd, F_SETFL, flags | O_NONBLOCK); inhibit_close = true; /* Note that we don't inhibit mmap() in this case. If stdin is redirected from a regular file, mmap() will still work. */ @@ -1366,11 +1375,24 @@ wget_read_file (const char *file) /* Successful read. */ fm->length += nread; else if (nread < 0) - /* Error. */ - goto lose; + { + if (errno == EAGAIN) + { + *left_open = true; + break; + } + else + { + /* Error. */ + goto lose; + } + } else - /* EOF */ - break; + { + /* EOF */ + *left_open = false; + break; + } } if (!inhibit_close) close (fd); diff --git a/src/utils.h b/src/utils.h index 842782d1..2e6a979f 100644 --- a/src/utils.h +++ b/src/utils.h @@ -106,6 +106,7 @@ bool has_wildcards_p (const char *); bool has_html_suffix_p (const char *); +struct file_memory *wget_read_from_file (const char *, bool *); struct file_memory *wget_read_file (const char *); void wget_read_file_free (struct file_memory *);