From b70363371565a8780e013bba275e0cad4196b90f Mon Sep 17 00:00:00 2001 From: Darshit Shah Date: Sat, 14 Sep 2013 18:29:58 +0530 Subject: [PATCH] Add feature that allows one to ensure that Wget correctly crawls the website in recursive mode --- testenv/ChangeLog | 19 +++++++++++++++++++ testenv/HTTPServer.py | 21 ++++++++++++++++++--- testenv/Test--spider-r.py | 18 +++++++++++++++++- testenv/WgetTest.py | 13 ++++++++++++- 4 files changed, 66 insertions(+), 5 deletions(-) diff --git a/testenv/ChangeLog b/testenv/ChangeLog index b718f398..23657eef 100644 --- a/testenv/ChangeLog +++ b/testenv/ChangeLog @@ -1,3 +1,22 @@ +2013-09-14 Darshit Shah + + * HTTPServer.py (StoppableHTTPServer): Define object variable + request_headers which stores a list of requests received by the server + (StoppableHTTPServer.get_req_headers): Return the list of Request + Headers stored by the server + (_Handler.do_HEAD): Send the Request Method string for identification + (_Handler.do_GET): Same + (_Handler.__log_request): Log the request in Request_Headers list + (_Handler.send_head): Make a call to __log_request + * Test--spider-r.py: Add new list, Request_List, which contains all + the requests that Wget is expected to send. This will allow for + fine-grained tests on recursive downloading. + * WgetTest.py (CommonMethods.FilesCrawled): New Post-Test Hook, that + ensures that all the expected Files on the server were accessed as + expected. + (HTTPTest.stop_HTTP_Server): On stopping server, asks it to respond + with list of all requests it received. + 2013-09-13 Darshit Shah * Test--spider-r.py: Test retrieval in recursive spider mode. diff --git a/testenv/HTTPServer.py b/testenv/HTTPServer.py index e1268ec1..aab8d4f5 100644 --- a/testenv/HTTPServer.py +++ b/testenv/HTTPServer.py @@ -22,6 +22,8 @@ class ServerError (Exception): class StoppableHTTPServer (HTTPServer): + request_headers = list () + """ Define methods for configuring the Server. 
""" def server_conf (self, filelist, conf_dict): @@ -29,6 +31,9 @@ class StoppableHTTPServer (HTTPServer): self.server_configs = conf_dict self.fileSys = filelist + def get_req_headers (self): + return self.request_headers + class WgetHTTPRequestHandler (BaseHTTPRequestHandler): @@ -49,10 +54,10 @@ class _Handler (WgetHTTPRequestHandler): """ Define functions for various HTTP Requests. """ def do_HEAD (self): - self.send_head () + self.send_head ("HEAD") def do_GET (self): - content, start = self.send_head () + content, start = self.send_head ("GET") if content: if start is None: self.wfile.write (content.encode ('utf-8')) @@ -325,9 +330,17 @@ class _Handler (WgetHTTPRequestHandler): return False return True - def send_head (self): + def __log_request (self, method): + req = method + " " + self.path + self.server.request_headers.append (req) + + def send_head (self, method): """ Common code for GET and HEAD Commands. This method is overriden to use the fileSys dict. + + The method variable contains whether this was a HEAD or a GET Request. + According to RFC 2616, the server should not differentiate between + the two requests, however, we use it here for a specific test. 
""" if self.path == "/": @@ -335,6 +348,8 @@ class _Handler (WgetHTTPRequestHandler): else: path = self.path[1:] + self.__log_request (method) + if path in self.server.fileSys: self.rules = self.server.server_configs.get (path) diff --git a/testenv/Test--spider-r.py b/testenv/Test--spider-r.py index b52db870..b770a9f2 100755 --- a/testenv/Test--spider-r.py +++ b/testenv/Test--spider-r.py @@ -58,6 +58,21 @@ secondpage_html = WgetFile ("secondpage.html", secondpage) thirdpage_html = WgetFile ("thirdpage.html", thirdpage) dummy_txt = WgetFile ("dummy.txt", dummyfile) +Request_List = [ + [ + "HEAD /", + "GET /", + "GET /robots.txt", + "HEAD /secondpage.html", + "GET /secondpage.html", + "HEAD /nonexistent", + "HEAD /thirdpage.html", + "GET /thirdpage.html", + "HEAD /dummy.txt", + "HEAD /againnonexistent" + ] +] + WGET_OPTIONS = "-d --spider -r" WGET_URLS = [[""]] @@ -76,7 +91,8 @@ test_options = { } post_test = { "ExpectedFiles" : ExpectedDownloadedFiles, - "ExpectedRetcode" : ExpectedReturnCode + "ExpectedRetcode" : ExpectedReturnCode, + "FilesCrawled" : Request_List } err = HTTPTest ( diff --git a/testenv/WgetTest.py b/testenv/WgetTest.py index f94823ea..9e5d2fe0 100644 --- a/testenv/WgetTest.py +++ b/testenv/WgetTest.py @@ -168,7 +168,6 @@ class CommonMethods: server_rules[file_obj.name] = rule_obj self.server_list[i].server_conf (file_list, server_rules) - def LocalFiles (self, local_files): for file_obj in local_files: file_handler = open (file_obj.name, "w") @@ -193,6 +192,15 @@ class CommonMethods: def ExpectedFiles (self, exp_filesys): self.__check_downloaded_files (exp_filesys) + def FilesCrawled (self, Request_Headers): + for i in range (0, self.servers): + headers = set(Request_Headers[i]) + o_headers = self.Request_remaining[i] + header_diff = headers.symmetric_difference (o_headers) + if len(header_diff) is not 0: + printer ("RED", str (header_diff)) + raise TestFailed ("Not all files were crawled correctly") + """ Class for HTTP Tests. 
""" @@ -271,7 +279,10 @@ class HTTPTest (CommonMethods): return server def stop_HTTP_Server (self): + self.Request_remaining = list () for server in self.server_list: + server_req = server.server_inst.get_req_headers () + self.Request_remaining.append (server_req) server.server_inst.shutdown () """ WgetFile is a File Data Container object """