mirror of
https://github.com/mirror/wget.git
synced 2025-03-13 11:20:19 +08:00
Support continuous reading from stdin pipes
Rather than reading from stdin only once, leave the pipe open until the other end closes it, and keep reading from the file after each set of URLs is read. * src/html-url.h(get_urls_file): Update prototype to add an additional parameter. * src/html-url.c(get_urls_file): Pass through read_again to wget_read_from_file. * src/retr.c(retrieve_from_file): Split the function into two. Introduce `retrieve_from_url_list` that actually performs the retrieval. Also, if reading the URL list reports that the fd has been left open, then continue reading from it until the fd is closed. (retrieve_from_url_list): New function that does the retrieval from a list of URLs that was read from a file. * src/utils.c(wget_read_from_file): Rename old function `wget_read_file` to this. Accept an additional output parameter that states whether the fd was left open and whether we should continue reading from it after the current set of URLs has been processed. (wget_read_file): Write it as a new wrapper function around `wget_read_from_file` to maintain API compatibility across other users.
This commit is contained in:
parent
ca10f20aaf
commit
196ce0abd6
@ -874,20 +874,21 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
|
||||
to get_urls_html, so we put it here. */
|
||||
|
||||
struct urlpos *
|
||||
get_urls_file (const char *file)
|
||||
get_urls_file (const char *file, bool *read_again)
|
||||
{
|
||||
struct file_memory *fm;
|
||||
struct urlpos *head, *tail;
|
||||
const char *text, *text_end;
|
||||
|
||||
/* Load the file. */
|
||||
fm = wget_read_file (file);
|
||||
fm = wget_read_from_file (file, read_again);
|
||||
if (!fm)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
|
||||
return NULL;
|
||||
}
|
||||
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
||||
if (fm->length)
|
||||
DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
|
||||
|
||||
head = tail = NULL;
|
||||
text = fm->content;
|
||||
|
@ -48,7 +48,7 @@ struct map_context {
|
||||
struct urlpos *head; /* List of URLs that is being built. */
|
||||
};
|
||||
|
||||
struct urlpos *get_urls_file (const char *);
|
||||
struct urlpos *get_urls_file (const char *, bool *);
|
||||
struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *);
|
||||
struct urlpos *get_urls_html_fm (const char *, const struct file_memory *, const char *, bool *, struct iri *);
|
||||
struct urlpos *append_url (const char *, int, int, struct map_context *);
|
||||
|
142
src/retr.c
142
src/retr.c
@ -1177,73 +1177,10 @@ bail:
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Find the URLs in the file and call retrieve_url() for each of them.
|
||||
If HTML is true, treat the file as HTML, and construct the URLs
|
||||
accordingly.
|
||||
|
||||
If opt.recursive is set, call retrieve_tree() for each file. */
|
||||
|
||||
uerr_t
|
||||
retrieve_from_file (const char *file, bool html, int *count)
|
||||
static uerr_t retrieve_from_url_list(struct urlpos *url_list, int *count, struct iri *iri)
|
||||
{
|
||||
struct urlpos *cur_url;
|
||||
uerr_t status;
|
||||
struct urlpos *url_list, *cur_url;
|
||||
struct iri *iri = iri_new();
|
||||
|
||||
char *input_file, *url_file = NULL;
|
||||
const char *url = file;
|
||||
|
||||
status = RETROK; /* Suppose everything is OK. */
|
||||
*count = 0; /* Reset the URL count. */
|
||||
|
||||
/* sXXXav : Assume filename and links in the file are in the locale */
|
||||
set_uri_encoding (iri, opt.locale, true);
|
||||
set_content_encoding (iri, opt.locale);
|
||||
|
||||
if (url_valid_scheme (url))
|
||||
{
|
||||
int dt,url_err;
|
||||
struct url *url_parsed = url_parse (url, &url_err, iri, true);
|
||||
if (!url_parsed)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err));
|
||||
iri_free (iri);
|
||||
return URLERROR;
|
||||
}
|
||||
|
||||
if (!opt.base_href)
|
||||
opt.base_href = xstrdup (url);
|
||||
|
||||
status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
|
||||
false, iri, true);
|
||||
url_free (url_parsed);
|
||||
|
||||
if (!url_file || (status != RETROK))
|
||||
return status;
|
||||
|
||||
if (dt & TEXTHTML)
|
||||
html = true;
|
||||
|
||||
#ifdef ENABLE_IRI
|
||||
/* If we have a found a content encoding, use it.
|
||||
* ( == is okay, because we're checking for identical object) */
|
||||
if (iri->content_encoding != opt.locale)
|
||||
set_uri_encoding (iri, iri->content_encoding, false);
|
||||
#endif
|
||||
|
||||
/* Reset UTF-8 encode status */
|
||||
iri->utf8_encode = opt.enable_iri;
|
||||
xfree (iri->orig_url);
|
||||
|
||||
input_file = url_file;
|
||||
}
|
||||
else
|
||||
input_file = (char *) file;
|
||||
|
||||
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
|
||||
: get_urls_file (input_file));
|
||||
|
||||
xfree (url_file);
|
||||
|
||||
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
|
||||
{
|
||||
@ -1310,6 +1247,81 @@ Removing file due to --delete-after in retrieve_from_file():\n"));
|
||||
xfree (filename);
|
||||
iri_free (tmpiri);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
/* Find the URLs in the file and call retrieve_url() for each of them.
|
||||
If HTML is true, treat the file as HTML, and construct the URLs
|
||||
accordingly.
|
||||
|
||||
If opt.recursive is set, call retrieve_tree() for each file. */
|
||||
|
||||
uerr_t
|
||||
retrieve_from_file (const char *file, bool html, int *count)
|
||||
{
|
||||
uerr_t status;
|
||||
struct urlpos *url_list, *cur_url;
|
||||
struct iri *iri = iri_new();
|
||||
|
||||
char *input_file, *url_file = NULL;
|
||||
const char *url = file;
|
||||
|
||||
status = RETROK; /* Suppose everything is OK. */
|
||||
*count = 0; /* Reset the URL count. */
|
||||
|
||||
/* sXXXav : Assume filename and links in the file are in the locale */
|
||||
set_uri_encoding (iri, opt.locale, true);
|
||||
set_content_encoding (iri, opt.locale);
|
||||
|
||||
if (url_valid_scheme (url))
|
||||
{
|
||||
int dt,url_err;
|
||||
struct url *url_parsed = url_parse (url, &url_err, iri, true);
|
||||
if (!url_parsed)
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err));
|
||||
iri_free (iri);
|
||||
return URLERROR;
|
||||
}
|
||||
|
||||
if (!opt.base_href)
|
||||
opt.base_href = xstrdup (url);
|
||||
|
||||
status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
|
||||
false, iri, true);
|
||||
url_free (url_parsed);
|
||||
|
||||
if (!url_file || (status != RETROK))
|
||||
return status;
|
||||
|
||||
if (dt & TEXTHTML)
|
||||
html = true;
|
||||
|
||||
#ifdef ENABLE_IRI
|
||||
/* If we have a found a content encoding, use it.
|
||||
* ( == is okay, because we're checking for identical object) */
|
||||
if (iri->content_encoding != opt.locale)
|
||||
set_uri_encoding (iri, iri->content_encoding, false);
|
||||
#endif
|
||||
|
||||
/* Reset UTF-8 encode status */
|
||||
iri->utf8_encode = opt.enable_iri;
|
||||
xfree (iri->orig_url);
|
||||
|
||||
input_file = url_file;
|
||||
}
|
||||
else
|
||||
input_file = (char *) file;
|
||||
|
||||
bool read_again = false;
|
||||
do {
|
||||
url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
|
||||
: get_urls_file (input_file, &read_again));
|
||||
|
||||
status = retrieve_from_url_list(url_list, count, iri);
|
||||
} while (read_again);
|
||||
|
||||
xfree (url_file);
|
||||
|
||||
/* Free the linked list of URL-s. */
|
||||
free_urlpos (url_list);
|
||||
|
32
src/utils.c
32
src/utils.c
@ -1266,6 +1266,13 @@ has_html_suffix_p (const char *fname)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Backward-compatible wrapper around wget_read_from_file().
   Reads FILE into memory exactly as the old wget_read_file() did,
   discarding the "fd was left open" flag that the newer interface
   reports, so existing callers keep the original one-shot semantics.
   Returns the file contents, or NULL on failure (see
   wget_read_from_file for details).  */
struct file_memory *
wget_read_file (const char *file)
{
  bool discarded_left_open;

  return wget_read_from_file (file, &discarded_left_open);
}
|
||||
|
||||
/* Read FILE into memory. A pointer to `struct file_memory' are
|
||||
returned; use struct element `content' to access file contents, and
|
||||
the element `length' to know the file length. `content' is *not*
|
||||
@ -1283,7 +1290,7 @@ has_html_suffix_p (const char *fname)
|
||||
If you want to read from a real file named "-", use "./-" instead. */
|
||||
|
||||
struct file_memory *
|
||||
wget_read_file (const char *file)
|
||||
wget_read_from_file (const char *file, bool *left_open)
|
||||
{
|
||||
int fd;
|
||||
struct file_memory *fm;
|
||||
@ -1296,6 +1303,8 @@ wget_read_file (const char *file)
|
||||
if (HYPHENP (file))
|
||||
{
|
||||
fd = fileno (stdin);
|
||||
int flags = fcntl(fd, F_GETFL, 0);
|
||||
fcntl(fd, F_SETFL, flags | O_NONBLOCK);
|
||||
inhibit_close = true;
|
||||
/* Note that we don't inhibit mmap() in this case. If stdin is
|
||||
redirected from a regular file, mmap() will still work. */
|
||||
@ -1366,11 +1375,24 @@ wget_read_file (const char *file)
|
||||
/* Successful read. */
|
||||
fm->length += nread;
|
||||
else if (nread < 0)
|
||||
/* Error. */
|
||||
goto lose;
|
||||
{
|
||||
if (errno == EAGAIN)
|
||||
{
|
||||
*left_open = true;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Error. */
|
||||
goto lose;
|
||||
}
|
||||
}
|
||||
else
|
||||
/* EOF */
|
||||
break;
|
||||
{
|
||||
/* EOF */
|
||||
*left_open = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!inhibit_close)
|
||||
close (fd);
|
||||
|
@ -106,6 +106,7 @@ bool has_wildcards_p (const char *);
|
||||
|
||||
bool has_html_suffix_p (const char *);
|
||||
|
||||
struct file_memory *wget_read_from_file (const char *, bool *);
|
||||
struct file_memory *wget_read_file (const char *);
|
||||
void wget_read_file_free (struct file_memory *);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user