Add a feature to verify that Wget correctly crawls a website in recursive mode

Darshit Shah 2013-09-14 18:29:58 +05:30
parent 0758f47954
commit b703633715
4 changed files with 66 additions and 5 deletions

ChangeLog

@@ -1,3 +1,22 @@
+2013-09-14  Darshit Shah  <darnir@gmail.com>
+
+	* HTTPServer.py (StoppableHTTPServer): Define object variable
+	request_headers, which stores a list of requests received by the server.
+	(StoppableHTTPServer.get_req_headers): Return the list of request
+	headers stored by the server.
+	(_Handler.do_HEAD): Send the request method string for identification.
+	(_Handler.do_GET): Same.
+	(_Handler.__log_request): Log the request in the request_headers list.
+	(_Handler.send_head): Make a call to __log_request.
+	* Test--spider-r.py: Add a new list, Request_List, which contains all
+	the requests that Wget is expected to send. This allows for
+	fine-grained tests of recursive downloading.
+	* WgetTest.py (CommonMethods.FilesCrawled): New Post-Test Hook that
+	ensures all the expected files on the server were accessed
+	during the test.
+	(HTTPTest.stop_HTTP_Server): On stopping the server, ask it to respond
+	with the list of all requests it received.
+
 2013-09-13  Darshit Shah  <darnir@gmail.com>

 	* Test--spider-r.py: Test retrieval in recursive spider mode.
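
Taken together, the entry describes a round trip: the server records every request it serves, and the test harness later compares that log against a hand-written expectation. A minimal standalone sketch of that comparison, using hypothetical request strings (the names mirror the diffs below):

    # Each request is recorded as "<METHOD> <path>"; the post-test check
    # fails if the expected and recorded sets differ in either direction.
    expected = ["HEAD /", "GET /", "GET /robots.txt"]
    recorded = ["HEAD /", "GET /robots.txt", "GET /"]   # order is irrelevant

    diff = set (expected).symmetric_difference (recorded)
    if len (diff) != 0:
        raise Exception ("Not all files were crawled correctly: " + str (diff))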

HTTPServer.py

@@ -22,6 +22,8 @@ class ServerError (Exception):

 class StoppableHTTPServer (HTTPServer):
+    request_headers = list ()
+
     """ Define methods for configuring the Server. """

     def server_conf (self, filelist, conf_dict):
@@ -29,6 +31,9 @@ class StoppableHTTPServer (HTTPServer):
         self.server_configs = conf_dict
         self.fileSys = filelist
+
+    def get_req_headers (self):
+        return self.request_headers

 class WgetHTTPRequestHandler (BaseHTTPRequestHandler):
@@ -49,10 +54,10 @@ class _Handler (WgetHTTPRequestHandler):
     """ Define functions for various HTTP Requests. """

     def do_HEAD (self):
-        self.send_head ()
+        self.send_head ("HEAD")

     def do_GET (self):
-        content, start = self.send_head ()
+        content, start = self.send_head ("GET")
         if content:
             if start is None:
                 self.wfile.write (content.encode ('utf-8'))
@@ -325,9 +330,17 @@ class _Handler (WgetHTTPRequestHandler):
             return False
         return True

-    def send_head (self):
+    def __log_request (self, method):
+        req = method + " " + self.path
+        self.server.request_headers.append (req)
+
+    def send_head (self, method):
         """ Common code for GET and HEAD Commands.

         This method is overridden to use the fileSys dict.
+
+        The method variable contains whether this was a HEAD or a GET Request.
+        According to RFC 2616, the server should not differentiate between
+        the two requests; however, we use it here for a specific test.
         """
         if self.path == "/":
@@ -335,6 +348,8 @@
         else:
             path = self.path[1:]
+        self.__log_request (method)
+
         if path in self.server.fileSys:
             self.rules = self.server.server_configs.get (path)
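
As a rough usage sketch (with a hypothetical request sequence), the list that get_req_headers () hands back is simply the "<METHOD> <path>" strings accumulated by __log_request, in arrival order:

    # Simulate what request_headers accumulates during a crawl; the real
    # server appends one entry per handled request.
    request_headers = list ()
    for method, path in [("HEAD", "/"), ("GET", "/"), ("HEAD", "/secondpage.html")]:
        request_headers.append (method + " " + path)
    print (request_headers)   # ['HEAD /', 'GET /', 'HEAD /secondpage.html']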

Test--spider-r.py

@@ -58,6 +58,21 @@ secondpage_html = WgetFile ("secondpage.html", secondpage)
 thirdpage_html = WgetFile ("thirdpage.html", thirdpage)
 dummy_txt = WgetFile ("dummy.txt", dummyfile)

+Request_List = [
+    [
+        "HEAD /",
+        "GET /",
+        "GET /robots.txt",
+        "HEAD /secondpage.html",
+        "GET /secondpage.html",
+        "HEAD /nonexistent",
+        "HEAD /thirdpage.html",
+        "GET /thirdpage.html",
+        "HEAD /dummy.txt",
+        "HEAD /againnonexistent"
+    ]
+]
+
 WGET_OPTIONS = "-d --spider -r"
 WGET_URLS = [[""]]
@@ -76,7 +91,8 @@ test_options = {
 }

 post_test = {
     "ExpectedFiles" : ExpectedDownloadedFiles,
-    "ExpectedRetcode" : ExpectedReturnCode
+    "ExpectedRetcode" : ExpectedReturnCode,
+    "FilesCrawled" : Request_List
 }

 err = HTTPTest (
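
Note that Request_List, like WGET_URLS, is a list of lists with one inner list per server; FilesCrawled indexes it by server number. This test uses a single server, but a hypothetical two-server test would extend it as:

    Request_List = [
        [                       # requests expected at the first server
            "HEAD /",
            "GET /"
        ],
        [                       # requests expected at the second server
            "HEAD /mirror.html"
        ]
    ]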

WgetTest.py

@@ -168,7 +168,6 @@ class CommonMethods:
             server_rules[file_obj.name] = rule_obj
         self.server_list[i].server_conf (file_list, server_rules)
-
     def LocalFiles (self, local_files):
         for file_obj in local_files:
             file_handler = open (file_obj.name, "w")
@@ -193,6 +192,15 @@ class CommonMethods:
     def ExpectedFiles (self, exp_filesys):
         self.__check_downloaded_files (exp_filesys)

+    def FilesCrawled (self, Request_Headers):
+        for i in range (0, self.servers):
+            headers = set (Request_Headers[i])
+            o_headers = self.Request_remaining[i]
+            header_diff = headers.symmetric_difference (o_headers)
+            if len (header_diff) != 0:
+                printer ("RED", str (header_diff))
+                raise TestFailed ("Not all files were crawled correctly")
+

 """ Class for HTTP Tests. """
@@ -271,7 +279,10 @@ class HTTPTest (CommonMethods):
         return server

     def stop_HTTP_Server (self):
+        self.Request_remaining = list ()
         for server in self.server_list:
+            server_req = server.server_inst.get_req_headers ()
+            self.Request_remaining.append (server_req)
             server.server_inst.shutdown ()

 """ WgetFile is a File Data Container object """