mirror of https://github.com/mirror/wget.git
synced 2025-02-05 01:01:00 +08:00
Add feature to ensure that Wget correctly crawls the website in recursive mode
This commit is contained in:
parent 0758f47954
commit b703633715
ChangeLog
@@ -1,3 +1,22 @@
+2013-09-14  Darshit Shah  <darnir@gmail.com>
+
+	* HTTPServer.py (StoppableHTTPServer): Define object variable
+	request_headers, which stores a list of requests received by the server.
+	(StoppableHTTPServer.get_req_headers): Return the list of Request
+	Headers stored by the server.
+	(_Handler.do_HEAD): Send the Request Method string for identification.
+	(_Handler.do_GET): Same.
+	(_Handler.__log_request): Log the request in the request_headers list.
+	(_Handler.send_head): Make a call to __log_request.
+	* Test--spider-r.py: Add new list, Request_List, which contains all
+	the requests that Wget is expected to send. This will allow for
+	fine-grained tests on recursive downloading.
+	* WgetTest.py (CommonMethods.FilesCrawled): New Post-Test Hook that
+	ensures that all the expected files on the server were accessed
+	during the crawl.
+	(HTTPTest.stop_HTTP_Server): On stopping the server, ask it to
+	respond with the list of all requests it received.
+
 2013-09-13  Darshit Shah  <darnir@gmail.com>
 
 	* Test--spider-r.py: Test retrieval in recursive spider mode.
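In outline, the feature works as the entries above describe: the server records every request it serves, the harness collects that log when the server stops, and a post-test hook diffs it against the expected crawl. The following condensed sketch is illustrative only (log_request and request_log are stand-in names, not code from the suite):

# Hypothetical, self-contained sketch of the record-and-compare flow.
request_log = []                        # plays the role of request_headers

def log_request (method, path):        # stands in for _Handler.__log_request
    request_log.append (method + " " + path)

log_request ("HEAD", "/")               # what the server would record
log_request ("GET", "/")

expected = {"HEAD /", "GET /"}          # the test's Request_List, as a set
diff = set (request_log).symmetric_difference (expected)
assert not diff, "Not all files were crawled correctly: " + str (diff)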
HTTPServer.py
@@ -22,6 +22,8 @@ class ServerError (Exception):
 
 class StoppableHTTPServer (HTTPServer):
 
+    request_headers = list ()
+
     """ Define methods for configuring the Server. """
 
     def server_conf (self, filelist, conf_dict):
@@ -29,6 +31,9 @@ class StoppableHTTPServer (HTTPServer):
         self.server_configs = conf_dict
         self.fileSys = filelist
 
+    def get_req_headers (self):
+        return self.request_headers
+
 
 class WgetHTTPRequestHandler (BaseHTTPRequestHandler):
 
@@ -49,10 +54,10 @@ class _Handler (WgetHTTPRequestHandler):
     """ Define functions for various HTTP Requests. """
 
     def do_HEAD (self):
-        self.send_head ()
+        self.send_head ("HEAD")
 
     def do_GET (self):
-        content, start = self.send_head ()
+        content, start = self.send_head ("GET")
         if content:
             if start is None:
                 self.wfile.write (content.encode ('utf-8'))
@@ -325,9 +330,17 @@ class _Handler (WgetHTTPRequestHandler):
             return False
         return True
 
-    def send_head (self):
+    def __log_request (self, method):
+        req = method + " " + self.path
+        self.server.request_headers.append (req)
+
+    def send_head (self, method):
         """ Common code for GET and HEAD Commands.
         This method is overridden to use the fileSys dict.
+
+        The method variable records whether this was a HEAD or a GET Request.
+        According to RFC 2616, the server should not differentiate between
+        the two requests; however, we use it here for a specific test.
         """
 
         if self.path == "/":
@@ -335,6 +348,8 @@ class _Handler (WgetHTTPRequestHandler):
         else:
             path = self.path[1:]
 
+        self.__log_request (method)
+
         if path in self.server.fileSys:
             self.rules = self.server.server_configs.get (path)
 
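The docstring above notes that, per RFC 2616, HEAD and GET should be served identically apart from the response body. A minimal, hypothetical handler pair (not the suite's classes; it only assumes the server exposes a request_headers list, as StoppableHTTPServer does after this commit) shows the split that send_head (method) supports:

from http.server import HTTPServer, BaseHTTPRequestHandler

class LoggingHTTPServer (HTTPServer):
    request_headers = []                # same idea as StoppableHTTPServer's log

class EchoHandler (BaseHTTPRequestHandler):
    body = b"hello\n"

    def _head (self, method):
        # Record "METHOD /path", matching _Handler.__log_request's format.
        self.server.request_headers.append (method + " " + self.path)
        self.send_response (200)
        self.send_header ("Content-Length", str (len (self.body)))
        self.end_headers ()

    def do_HEAD (self):
        self._head ("HEAD")             # headers only, no body

    def do_GET (self):
        self._head ("GET")              # identical headers per RFC 2616...
        self.wfile.write (self.body)    # ...plus the body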
Test--spider-r.py
@@ -58,6 +58,21 @@ secondpage_html = WgetFile ("secondpage.html", secondpage)
 thirdpage_html = WgetFile ("thirdpage.html", thirdpage)
 dummy_txt = WgetFile ("dummy.txt", dummyfile)
 
+Request_List = [
+    [
+        "HEAD /",
+        "GET /",
+        "GET /robots.txt",
+        "HEAD /secondpage.html",
+        "GET /secondpage.html",
+        "HEAD /nonexistent",
+        "HEAD /thirdpage.html",
+        "GET /thirdpage.html",
+        "HEAD /dummy.txt",
+        "HEAD /againnonexistent"
+    ]
+]
+
 WGET_OPTIONS = "-d --spider -r"
 WGET_URLS = [[""]]
 
@@ -76,7 +91,8 @@ test_options = {
 }
 post_test = {
     "ExpectedFiles"   : ExpectedDownloadedFiles,
-    "ExpectedRetcode" : ExpectedReturnCode
+    "ExpectedRetcode" : ExpectedReturnCode,
+    "FilesCrawled"    : Request_List
 }
 
 err = HTTPTest (
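Request_List is nested one level deep: the outer list is indexed per server, parallel to WGET_URLS, and each inner list holds every request that server should see. The sequence itself reflects how --spider -r behaves in this test: HTML pages get a HEAD followed by a GET (the body is needed for link extraction), robots.txt is fetched directly with GET, and leaf files or broken links get only a HEAD. Since FilesCrawled compares the lists as sets, the order within an inner list is not part of the contract. A toy illustration of the indexing, with hypothetical servers and values:

# Hypothetical two-server test: each server gets its own expected list,
# in the same position as the corresponding WGET_URLS entry.
WGET_URLS    = [[""], ["mirror.html"]]
Request_List = [
    ["HEAD /", "GET /"],                        # expected of server 0
    ["HEAD /mirror.html", "GET /mirror.html"],  # expected of server 1
]
assert len (Request_List) == len (WGET_URLS)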
WgetTest.py
@@ -168,7 +168,6 @@ class CommonMethods:
             server_rules[file_obj.name] = rule_obj
         self.server_list[i].server_conf (file_list, server_rules)
 
-
     def LocalFiles (self, local_files):
         for file_obj in local_files:
            file_handler = open (file_obj.name, "w")
@@ -193,6 +192,15 @@ class CommonMethods:
     def ExpectedFiles (self, exp_filesys):
         self.__check_downloaded_files (exp_filesys)
 
+    def FilesCrawled (self, Request_Headers):
+        for i in range (0, self.servers):
+            headers = set (Request_Headers[i])
+            o_headers = self.Request_remaining[i]
+            header_diff = headers.symmetric_difference (o_headers)
+            if len (header_diff) != 0:
+                printer ("RED", str (header_diff))
+                raise TestFailed ("Not all files were crawled correctly")
+
 
 """ Class for HTTP Tests. """
 
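symmetric_difference makes the check bidirectional: it surfaces both requests Wget should have sent but did not, and requests it sent that the test did not expect. A two-line demonstration with made-up paths:

expected = {"HEAD /a", "GET /a"}
actual   = {"HEAD /a", "GET /b"}
print (expected.symmetric_difference (actual))
# {'GET /a', 'GET /b'} : one missing request and one unexpected one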
@@ -271,7 +279,10 @@ class HTTPTest (CommonMethods):
         return server
 
     def stop_HTTP_Server (self):
+        self.Request_remaining = list ()
         for server in self.server_list:
+            server_req = server.server_inst.get_req_headers ()
+            self.Request_remaining.append (server_req)
             server.server_inst.shutdown ()
 
 """ WgetFile is a File Data Container object """
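stop_HTTP_Server reads each server's log before calling shutdown () and appends in server_list order, so Request_remaining[i] in FilesCrawled always pairs with the expected list for server i. A small order-independence illustration, with made-up requests (not suite code):

Request_remaining = [["HEAD /", "GET /"]]   # collected from server 0
Request_List      = [["GET /", "HEAD /"]]   # expected of server 0
# FilesCrawled compares sets, so the differing order above still passes.
assert set (Request_remaining[0]).symmetric_difference (Request_List[0]) == set ()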