From 196ce0abd67b9c905c600a217a7d2dd667216081 Mon Sep 17 00:00:00 2001 From: Darshit Shah Date: Sun, 12 May 2024 17:48:21 +0200 Subject: [PATCH] Support continuous reading from stdin pipes Rather than reading from stdin only once, leave the pipe open until the other end closes it and keep reading from the file after each set of URLs is read * src/html-url.h(get_urls_file): Update prototype to add additional param * src/html-url.c(get_urls_file): Pass through read_again to wget_read_from_file. * src/retr.c(retrieve_from_file): Split the function into two. Introduce `retrieve_from_url_list` that actually performs the retrieval. Also, if `url_list` returns that the fd has been left open, then continue reading from it until the fd is closed. (retrieve_from_url_list): New function that does the retrieval from a list of URLs that was read from a file. * src/utils.c(wget_read_from_file): Rename old function `wget_read_file` to this. Accept an additional output parameter that states whether the fd was left open and if we should continue reading from it after the current set of URLs have been processed (wget_read_file): Write it as a new wrapper function around `wget_read_from_file` to maintain API compatibility across other users --- src/html-url.c | 7 +-- src/html-url.h | 2 +- src/retr.c | 142 +++++++++++++++++++++++++++---------------------- src/utils.c | 32 +++++++++-- src/utils.h | 1 + 5 files changed, 110 insertions(+), 74 deletions(-) diff --git a/src/html-url.c b/src/html-url.c index a4ba1150..8e960092 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -874,20 +874,21 @@ get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, to get_urls_html, so we put it here. */ struct urlpos * -get_urls_file (const char *file) +get_urls_file (const char *file, bool *read_again) { struct file_memory *fm; struct urlpos *head, *tail; const char *text, *text_end; /* Load the file. 
*/ - fm = wget_read_file (file); + fm = wget_read_from_file (file, read_again); if (!fm) { logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); return NULL; } - DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); + if (fm->length) + DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); head = tail = NULL; text = fm->content; diff --git a/src/html-url.h b/src/html-url.h index d4158164..cf80e43d 100644 --- a/src/html-url.h +++ b/src/html-url.h @@ -48,7 +48,7 @@ struct map_context { struct urlpos *head; /* List of URLs that is being built. */ }; -struct urlpos *get_urls_file (const char *); +struct urlpos *get_urls_file (const char *, bool *); struct urlpos *get_urls_html (const char *, const char *, bool *, struct iri *); struct urlpos *get_urls_html_fm (const char *, const struct file_memory *, const char *, bool *, struct iri *); struct urlpos *append_url (const char *, int, int, struct map_context *); diff --git a/src/retr.c b/src/retr.c index 3eea0e06..fc314caa 100644 --- a/src/retr.c +++ b/src/retr.c @@ -1177,73 +1177,10 @@ bail: return result; } -/* Find the URLs in the file and call retrieve_url() for each of them. - If HTML is true, treat the file as HTML, and construct the URLs - accordingly. - - If opt.recursive is set, call retrieve_tree() for each file. */ - -uerr_t -retrieve_from_file (const char *file, bool html, int *count) +static uerr_t retrieve_from_url_list(struct urlpos *url_list, int *count, struct iri *iri) { + struct urlpos *cur_url; uerr_t status; - struct urlpos *url_list, *cur_url; - struct iri *iri = iri_new(); - - char *input_file, *url_file = NULL; - const char *url = file; - - status = RETROK; /* Suppose everything is OK. */ - *count = 0; /* Reset the URL count. 
*/ - - /* sXXXav : Assume filename and links in the file are in the locale */ - set_uri_encoding (iri, opt.locale, true); - set_content_encoding (iri, opt.locale); - - if (url_valid_scheme (url)) - { - int dt,url_err; - struct url *url_parsed = url_parse (url, &url_err, iri, true); - if (!url_parsed) - { - logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err)); - iri_free (iri); - return URLERROR; - } - - if (!opt.base_href) - opt.base_href = xstrdup (url); - - status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt, - false, iri, true); - url_free (url_parsed); - - if (!url_file || (status != RETROK)) - return status; - - if (dt & TEXTHTML) - html = true; - -#ifdef ENABLE_IRI - /* If we have a found a content encoding, use it. - * ( == is okay, because we're checking for identical object) */ - if (iri->content_encoding != opt.locale) - set_uri_encoding (iri, iri->content_encoding, false); -#endif - - /* Reset UTF-8 encode status */ - iri->utf8_encode = opt.enable_iri; - xfree (iri->orig_url); - - input_file = url_file; - } - else - input_file = (char *) file; - - url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) - : get_urls_file (input_file)); - - xfree (url_file); for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) { @@ -1310,6 +1247,81 @@ Removing file due to --delete-after in retrieve_from_file():\n")); xfree (filename); iri_free (tmpiri); } + return status; +} + +/* Find the URLs in the file and call retrieve_url() for each of them. + If HTML is true, treat the file as HTML, and construct the URLs + accordingly. + + If opt.recursive is set, call retrieve_tree() for each file. */ + +uerr_t +retrieve_from_file (const char *file, bool html, int *count) +{ + uerr_t status; + struct urlpos *url_list, *cur_url; + struct iri *iri = iri_new(); + + char *input_file, *url_file = NULL; + const char *url = file; + + status = RETROK; /* Suppose everything is OK. */ + *count = 0; /* Reset the URL count. 
*/ + + /* sXXXav : Assume filename and links in the file are in the locale */ + set_uri_encoding (iri, opt.locale, true); + set_content_encoding (iri, opt.locale); + + if (url_valid_scheme (url)) + { + int dt,url_err; + struct url *url_parsed = url_parse (url, &url_err, iri, true); + if (!url_parsed) + { + logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err)); + iri_free (iri); + return URLERROR; + } + + if (!opt.base_href) + opt.base_href = xstrdup (url); + + status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt, + false, iri, true); + url_free (url_parsed); + + if (!url_file || (status != RETROK)) + return status; + + if (dt & TEXTHTML) + html = true; + +#ifdef ENABLE_IRI + /* If we have a found a content encoding, use it. + * ( == is okay, because we're checking for identical object) */ + if (iri->content_encoding != opt.locale) + set_uri_encoding (iri, iri->content_encoding, false); +#endif + + /* Reset UTF-8 encode status */ + iri->utf8_encode = opt.enable_iri; + xfree (iri->orig_url); + + input_file = url_file; + } + else + input_file = (char *) file; + + bool read_again = false; + do { + url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) + : get_urls_file (input_file, &read_again)); + + status = retrieve_from_url_list(url_list, count, iri); + } while (read_again); + + xfree (url_file); /* Free the linked list of URL-s. */ free_urlpos (url_list); diff --git a/src/utils.c b/src/utils.c index 9caaf727..58de4a72 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1266,6 +1266,13 @@ has_html_suffix_p (const char *fname) return false; } +struct file_memory * +wget_read_file (const char *file) +{ + bool left_open; + return wget_read_from_file(file, &left_open); +} + /* Read FILE into memory. A pointer to `struct file_memory' are returned; use struct element `content' to access file contents, and the element `length' to know the file length. 
`content' is *not* @@ -1283,7 +1290,7 @@ has_html_suffix_p (const char *fname) If you want to read from a real file named "-", use "./-" instead. */ struct file_memory * -wget_read_file (const char *file) +wget_read_from_file (const char *file, bool *left_open) { int fd; struct file_memory *fm; @@ -1296,6 +1303,8 @@ wget_read_file (const char *file) if (HYPHENP (file)) { fd = fileno (stdin); + int flags = fcntl(fd, F_GETFL, 0); + fcntl(fd, F_SETFL, flags | O_NONBLOCK); inhibit_close = true; /* Note that we don't inhibit mmap() in this case. If stdin is redirected from a regular file, mmap() will still work. */ @@ -1366,11 +1375,24 @@ wget_read_file (const char *file) /* Successful read. */ fm->length += nread; else if (nread < 0) - /* Error. */ - goto lose; + { + if (errno == EAGAIN) + { + *left_open = true; + break; + } + else + { + /* Error. */ + goto lose; + } + } else - /* EOF */ - break; + { + /* EOF */ + *left_open = false; + break; + } } if (!inhibit_close) close (fd); diff --git a/src/utils.h b/src/utils.h index 842782d1..2e6a979f 100644 --- a/src/utils.h +++ b/src/utils.h @@ -106,6 +106,7 @@ bool has_wildcards_p (const char *); bool has_html_suffix_p (const char *); +struct file_memory *wget_read_from_file (const char *, bool *); struct file_memory *wget_read_file (const char *); void wget_read_file_free (struct file_memory *);