From f4d019a423d5f3fdaec42116a8b997536a01da45 Mon Sep 17 00:00:00 2001
From: hniksic <devnull@localhost>
Date: Fri, 30 Nov 2001 20:18:51 -0800
Subject: [PATCH] [svn] Correctly convert links in <meta http-equiv=Refresh
 content="...">. Published in <sxsadx3wp49.fsf@florida.arsdigita.de>.

---
 src/ChangeLog  | 12 ++++++++++++
 src/html-url.c | 28 +++++++++++++++++-----------
 src/url.c      | 39 ++++++++++++++++++++++++++++++++++++---
 src/url.h      |  4 ++++
 4 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index 9d1bc81b..45f06eab 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,15 @@
+2001-12-01  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+	* url.c (replace_attr_refresh_hack): New function.
+	(convert_links): Call replace_attr_refresh_hack for Refresh
+	links.  It will add the "TMOUT; URL=" junk before the link.
+
+	* html-url.c (collect_tags_mapper): Set ID to the ID of the
+	"content" attribute, not "http-equiv".
+	(collect_tags_mapper): Don't use OFFSET to hack the raw_* values;
+	instead, store the information that this entry belongs to a
+	"refresh" link.
+
 2001-12-01  Hrvoje Niksic  <hniksic@arsdigita.com>
 
 	* recur.c (retrieve_tree): Allow -p retrievals to exceed maximum
diff --git a/src/html-url.c b/src/html-url.c
index 5942a49f..7b9bf29a 100644
--- a/src/html-url.c
+++ b/src/html-url.c
@@ -482,18 +482,21 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
 	     So we just need to skip past the "NUMBER; URL=" garbage
 	     to get to the URL.  */
 	  {
-	    int id;
 	    char *name = find_attr (tag, "name", NULL);
-	    char *http_equiv = find_attr (tag, "http-equiv", &id);
+	    char *http_equiv = find_attr (tag, "http-equiv", NULL);
 	    if (http_equiv && !strcasecmp (http_equiv, "refresh"))
 	      {
-		char *refresh = find_attr (tag, "content", NULL);
-		char *p = refresh;
-		int offset;
-		while (ISDIGIT (*p))
-		  ++p;
+		struct urlpos *entry;
+
+		int id;
+		char *p, *refresh = find_attr (tag, "content", &id);
+		int timeout = 0;
+
+		for (p = refresh; ISDIGIT (*p); p++)
+		  timeout = 10 * timeout + *p - '0';
 		if (*p++ != ';')
 		  return;
+
 		while (ISSPACE (*p))
 		  ++p;
 		if (!(TOUPPER (*p) == 'U'
@@ -504,10 +507,13 @@ collect_tags_mapper (struct taginfo *tag, void *arg)
 		p += 4;
 		while (ISSPACE (*p))
 		  ++p;
-		offset = p - refresh;
-		tag->attrs[id].value_raw_beginning += offset;
-		tag->attrs[id].value_raw_size -= offset;
-		handle_link (closure, p, tag, id);
+
+		entry = handle_link (closure, p, tag, id);
+		if (entry)
+		  {
+		    entry->link_refresh_p = 1;
+		    entry->refresh_timeout = timeout;
+		  }
 	      }
 	    else if (name && !strcasecmp (name, "robots"))
 	      {
diff --git a/src/url.c b/src/url.c
index 28e41c8c..38469418 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1698,7 +1698,10 @@ no_proxy_match (const char *host, const char **no_proxy)
 }
 
 static void write_backup_file PARAMS ((const char *, downloaded_file_t));
-static const char *replace_attr PARAMS ((const char *, int, FILE *, const char *));
+static const char *replace_attr PARAMS ((const char *, int, FILE *,
+					 const char *));
+static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,
+						      const char *, int));
 static char *local_quote_string PARAMS ((const char *));
 
 /* Change the links in one HTML file.  LINKS is a list of links in the
@@ -1797,7 +1800,13 @@ convert_links (const char *file, struct urlpos *links)
 	  {
 	    char *newname = construct_relative (file, link->local_name);
 	    char *quoted_newname = local_quote_string (newname);
-	    p = replace_attr (p, link->size, fp, quoted_newname);
+
+	    if (!link->link_refresh_p)
+	      p = replace_attr (p, link->size, fp, quoted_newname);
+	    else
+	      p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
+					     link->refresh_timeout);
+
 	    DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
 		     link->url->url, newname, link->pos, file));
 	    xfree (newname);
@@ -1810,7 +1819,13 @@ convert_links (const char *file, struct urlpos *links)
 	  {
 	    char *newlink = link->url->url;
 	    char *quoted_newlink = html_quote_string (newlink);
-	    p = replace_attr (p, link->size, fp, quoted_newlink);
+
+	    if (!link->link_refresh_p)
+	      p = replace_attr (p, link->size, fp, quoted_newlink);
+	    else
+	      p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,
+					     link->refresh_timeout);
+
 	    DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",
 		     newlink, link->pos, file));
 	    xfree (quoted_newlink);
@@ -2014,6 +2029,24 @@ replace_attr (const char *p, int size, FILE *fp, const char *new_text)
   return p;
 }
 
+/* The same as REPLACE_ATTR, but used when replacing
+   <meta http-equiv=refresh content="new_text"> because we need to
+   append "timeout_value; URL=" before the next_text.  */
+
+static const char *
+replace_attr_refresh_hack (const char *p, int size, FILE *fp,
+			   const char *new_text, int timeout)
+{
+  /* "0; URL=..." */
+  char *new_with_timeout = (char *)alloca (numdigit (timeout)
+					   + 6 /* "; URL=" */
+					   + strlen (new_text)
+					   + 1);
+  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);
+
+  return replace_attr (p, size, fp, new_with_timeout);
+}
+
 /* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
    preceded by '&'.  If the character is not found, return zero.  If
    the character is found, return 1 and set BP and EP to point to the
diff --git a/src/url.h b/src/url.h
index 3c42c5aa..b99c64ce 100644
--- a/src/url.h
+++ b/src/url.h
@@ -91,6 +91,10 @@ struct urlpos {
   unsigned int link_base_p	:1; /* was the link <base href=...> */
   unsigned int link_inline_p	:1; /* needed to render the page. */
 
+  unsigned int link_refresh_p	:1; /* link was received from
+				       <meta http-equiv=refresh content=...> */
+  int refresh_timeout;		/* for reconstructing the refresh. */
+
   /* Conversion requirements: */
   enum convert_options convert;	/* is conversion required? */