From e8e87978737f88a536f6606b9029ece7b1001b4e Mon Sep 17 00:00:00 2001
From: hniksic <devnull@localhost>
Date: Sun, 18 Nov 2001 17:14:14 -0800
Subject: [PATCH] [svn] Rewrite shorthand URLs in a step separate from parsing.
 Published in <sxspu6f7ecz.fsf@florida.arsdigita.de>.

---
 src/ChangeLog |  6 ++++++
 src/main.c    | 14 +++++++++---
 src/url.c     | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/url.h     |  2 ++
 4 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/src/ChangeLog b/src/ChangeLog
index c37a399c..f375f50f 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,9 @@
+2001-11-19  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+	* main.c (main): Use it.
+
+	* url.c (rewrite_url_maybe): New function.
+
 2001-11-19  Hrvoje Niksic  <hniksic@arsdigita.com>
 
 	* url.c: Clean up handling of URL schemes.
diff --git a/src/main.c b/src/main.c
index 6a79e0b0..092a455f 100644
--- a/src/main.c
+++ b/src/main.c
@@ -50,6 +50,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
 #include "recur.h"
 #include "host.h"
 #include "cookies.h"
+#include "url.h"
 
 /* On GNU system this will include system-wide getopt.h. */
 #include "getopt.h"
@@ -739,9 +740,14 @@ Can't timestamp and not clobber old files at the same time.\n"));
   /* Fill in the arguments.  */
   for (i = 0; i < nurl; i++, optind++)
     {
-      char *irix4_cc_needs_this;
-      STRDUP_ALLOCA (irix4_cc_needs_this, argv[optind]);
-      url[i] = irix4_cc_needs_this;
+      char *rewritten = rewrite_url_maybe (argv[optind]);
+      if (rewritten)
+	{
+	  printf ("Converted %s to %s\n", argv[optind], rewritten);
+	  url[i] = rewritten;
+	}
+      else
+	url[i] = xstrdup (argv[optind]);
     }
   url[i] = NULL;
 
@@ -853,6 +859,8 @@ Can't timestamp and not clobber old files at the same time.\n"));
       convert_all_links ();
     }
   log_close ();
+  for (i = 0; i < nurl; i++)
+    free (url[i]);
   cleanup ();
 #ifdef DEBUG_MALLOC
   print_malloc_debug_stats ();
diff --git a/src/url.c b/src/url.c
index c1c2b596..2159bb91 100644
--- a/src/url.c
+++ b/src/url.c
@@ -296,6 +296,66 @@ url_skip_uname (const char *url)
   else
     return 0;
 }
+
+/* Used by main.c: detect URLs written using the "shorthand" URL forms
+   popularized by Netscape and NcFTP.  HTTP shorthands look like this:
+
+   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
+   www.foo.com[:port]            -> http://www.foo.com[:port]
+
+   FTP shorthands look like this:
+
+   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
+   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
+
+   If the URL needs not or cannot be rewritten, return NULL.  */
+char *
+rewrite_url_maybe (const char *url)
+{
+  const char *p;
+
+  if (url_has_scheme (url))
+    return NULL;
+
+  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
+     latter Netscape.  */
+  for (p = url; *p && *p != ':' && *p != '/'; p++)
+    ;
+
+  if (p == url)
+    return NULL;
+
+  if (*p == ':')
+    {
+      const char *pp, *path;
+      char *res;
+      /* If the characters after the colon and before the next slash
+	 or end of string are all digits, it's HTTP.  */
+      int digits = 0;
+      for (pp = p + 1; ISDIGIT (*pp); pp++)
+	++digits;
+      if (digits > 0
+	  && (*pp == '/' || *pp == '\0'))
+	goto http;
+
+      /* Prepend "ftp://" to the entire URL... */
+      path = p + 1;
+      res = xmalloc (6 + strlen (url) + 1);
+      sprintf (res, "ftp://%s", url);
+      /* ...and replace ':' with '/'. */
+      res[6 + (p - url)] = '/';
+      return res;
+    }
+  else
+    {
+      char *res;
+    http:
+      /* Just prepend "http://" to what we have. */
+      res = xmalloc (7 + strlen (url) + 1);
+      sprintf (res, "http://%s", url);
+      return res;
+    }
+}
 
 /* Allocate a new urlinfo structure, fill it with default values and
    return a pointer to it.  */
diff --git a/src/url.h b/src/url.h
index 3c74a0e2..e9bcc379 100644
--- a/src/url.h
+++ b/src/url.h
@@ -137,4 +137,6 @@ urlpos *add_url PARAMS ((urlpos *, const char *, const char *));
 
 downloaded_file_t downloaded_file PARAMS ((downloaded_file_t, const char *));
 
+char *rewrite_url_maybe PARAMS ((const char *));
+
 #endif /* URL_H */