mirror of https://github.com/mirror/wget.git
synced 2025-02-05 01:01:00 +08:00
Add feature to ensure that Wget correctly crawls the website in recursive mode
This commit is contained in:
parent 0758f47954
commit b703633715
ChangeLog
@@ -1,3 +1,22 @@
+2013-09-14  Darshit Shah  <darnir@gmail.com>
+
+	* HTTPServer.py (StoppableHTTPServer): Define object variable
+	request_headers, which stores a list of requests received by the server.
+	(StoppableHTTPServer.get_req_headers): Return the list of Request
+	Headers stored by the server.
+	(_Handler.do_HEAD): Send the Request Method string for identification.
+	(_Handler.do_GET): Same.
+	(_Handler.__log_request): Log the request in the request_headers list.
+	(_Handler.send_head): Make a call to __log_request.
+	* Test--spider-r.py: Add new list, Request_List, which contains all
+	the requests that Wget is expected to send. This will allow for
+	fine-grained tests on recursive downloading.
+	* WgetTest.py (CommonMethods.FilesCrawled): New Post-Test Hook that
+	ensures that all the expected files on the server were accessed
+	during the crawl.
+	(HTTPTest.stop_HTTP_Server): On stopping the server, ask it to
+	respond with the list of all requests it received.
+
 2013-09-13  Darshit Shah  <darnir@gmail.com>
 
 	* Test--spider-r.py: Test retrieval in recursive spider mode.
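In outline, the feature works as the entries above describe: the server records every request it serves, the harness collects that log when the server stops, and a post-test hook diffs it against the expected crawl. The following condensed sketch is illustrative only (log_request and request_log are stand-in names, not code from the suite):

# Hypothetical, self-contained sketch of the record-and-compare flow.
request_log = []                        # plays the role of request_headers

def log_request (method, path):        # stands in for _Handler.__log_request
    request_log.append (method + " " + path)

log_request ("HEAD", "/")               # what the server would record
log_request ("GET", "/")

expected = {"HEAD /", "GET /"}          # the test's Request_List, as a set
diff = set (request_log).symmetric_difference (expected)
assert not diff, "Not all files were crawled correctly: " + str (diff)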
HTTPServer.py
@@ -22,6 +22,8 @@ class ServerError (Exception):
 
 class StoppableHTTPServer (HTTPServer):
 
+    request_headers = list ()
+
     """ Define methods for configuring the Server. """
 
     def server_conf (self, filelist, conf_dict):
@@ -29,6 +31,9 @@ class StoppableHTTPServer (HTTPServer):
         self.server_configs = conf_dict
         self.fileSys = filelist
 
+    def get_req_headers (self):
+        return self.request_headers
+
 
 class WgetHTTPRequestHandler (BaseHTTPRequestHandler):
 
@@ -49,10 +54,10 @@ class _Handler (WgetHTTPRequestHandler):
     """ Define functions for various HTTP Requests. """
 
     def do_HEAD (self):
-        self.send_head ()
+        self.send_head ("HEAD")
 
     def do_GET (self):
-        content, start = self.send_head ()
+        content, start = self.send_head ("GET")
         if content:
             if start is None:
                 self.wfile.write (content.encode ('utf-8'))
@@ -325,9 +330,17 @@ class _Handler (WgetHTTPRequestHandler):
             return False
         return True
 
-    def send_head (self):
+    def __log_request (self, method):
+        req = method + " " + self.path
+        self.server.request_headers.append (req)
+
+    def send_head (self, method):
         """ Common code for GET and HEAD Commands.
         This method is overridden to use the fileSys dict.
+
+        The method variable records whether this was a HEAD or a GET Request.
+        According to RFC 2616, the server should not differentiate between
+        the two requests; however, we use it here for a specific test.
         """
 
         if self.path == "/":
@@ -335,6 +348,8 @@ class _Handler (WgetHTTPRequestHandler):
         else:
             path = self.path[1:]
 
+        self.__log_request (method)
+
         if path in self.server.fileSys:
             self.rules = self.server.server_configs.get (path)
 
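The docstring above notes that, per RFC 2616, HEAD and GET should be served identically apart from the response body. A minimal, hypothetical handler pair (not the suite's classes; it only assumes the server exposes a request_headers list, as StoppableHTTPServer does after this commit) shows the split that send_head (method) supports:

from http.server import HTTPServer, BaseHTTPRequestHandler

class LoggingHTTPServer (HTTPServer):
    request_headers = []                # same idea as StoppableHTTPServer's log

class EchoHandler (BaseHTTPRequestHandler):
    body = b"hello\n"

    def _head (self, method):
        # Record "METHOD /path", matching _Handler.__log_request's format.
        self.server.request_headers.append (method + " " + self.path)
        self.send_response (200)
        self.send_header ("Content-Length", str (len (self.body)))
        self.end_headers ()

    def do_HEAD (self):
        self._head ("HEAD")             # headers only, no body

    def do_GET (self):
        self._head ("GET")              # identical headers per RFC 2616...
        self.wfile.write (self.body)    # ...plus the body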
Test--spider-r.py
@@ -58,6 +58,21 @@ secondpage_html = WgetFile ("secondpage.html", secondpage)
 thirdpage_html = WgetFile ("thirdpage.html", thirdpage)
 dummy_txt = WgetFile ("dummy.txt", dummyfile)
 
+Request_List = [
+    [
+        "HEAD /",
+        "GET /",
+        "GET /robots.txt",
+        "HEAD /secondpage.html",
+        "GET /secondpage.html",
+        "HEAD /nonexistent",
+        "HEAD /thirdpage.html",
+        "GET /thirdpage.html",
+        "HEAD /dummy.txt",
+        "HEAD /againnonexistent"
+    ]
+]
+
 WGET_OPTIONS = "-d --spider -r"
 WGET_URLS = [[""]]
 
@@ -76,7 +91,8 @@ test_options = {
 }
 post_test = {
     "ExpectedFiles"   : ExpectedDownloadedFiles,
-    "ExpectedRetcode" : ExpectedReturnCode
+    "ExpectedRetcode" : ExpectedReturnCode,
+    "FilesCrawled"    : Request_List
 }
 
 err = HTTPTest (
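Request_List is nested one level deep: the outer list is indexed per server, parallel to WGET_URLS, and each inner list holds every request that server should see. The sequence itself reflects how --spider -r behaves in this test: HTML pages get a HEAD followed by a GET (the body is needed for link extraction), robots.txt is fetched directly with GET, and leaf files or broken links get only a HEAD. Since FilesCrawled compares the lists as sets, the order within an inner list is not part of the contract. A toy illustration of the indexing, with hypothetical servers and values:

# Hypothetical two-server test: each server gets its own expected list,
# in the same position as the corresponding WGET_URLS entry.
WGET_URLS    = [[""], ["mirror.html"]]
Request_List = [
    ["HEAD /", "GET /"],                        # expected of server 0
    ["HEAD /mirror.html", "GET /mirror.html"],  # expected of server 1
]
assert len (Request_List) == len (WGET_URLS)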
WgetTest.py
@@ -168,7 +168,6 @@ class CommonMethods:
             server_rules[file_obj.name] = rule_obj
         self.server_list[i].server_conf (file_list, server_rules)
 
-
     def LocalFiles (self, local_files):
         for file_obj in local_files:
            file_handler = open (file_obj.name, "w")
@@ -193,6 +192,15 @@ class CommonMethods:
     def ExpectedFiles (self, exp_filesys):
         self.__check_downloaded_files (exp_filesys)
 
+    def FilesCrawled (self, Request_Headers):
+        for i in range (0, self.servers):
+            headers = set (Request_Headers[i])
+            o_headers = self.Request_remaining[i]
+            header_diff = headers.symmetric_difference (o_headers)
+            if len (header_diff) != 0:
+                printer ("RED", str (header_diff))
+                raise TestFailed ("Not all files were crawled correctly")
+
 
 """ Class for HTTP Tests. """
 
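symmetric_difference makes the check bidirectional: it surfaces both requests Wget should have sent but did not, and requests it sent that the test did not expect. A two-line demonstration with made-up paths:

expected = {"HEAD /a", "GET /a"}
actual   = {"HEAD /a", "GET /b"}
print (expected.symmetric_difference (actual))
# {'GET /a', 'GET /b'} : one missing request and one unexpected one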
@@ -271,7 +279,10 @@ class HTTPTest (CommonMethods):
         return server
 
     def stop_HTTP_Server (self):
+        self.Request_remaining = list ()
         for server in self.server_list:
+            server_req = server.server_inst.get_req_headers ()
+            self.Request_remaining.append (server_req)
             server.server_inst.shutdown ()
 
 """ WgetFile is a File Data Container object """
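stop_HTTP_Server reads each server's log before calling shutdown () and appends in server_list order, so Request_remaining[i] in FilesCrawled always pairs with the expected list for server i. A small order-independence illustration, with made-up requests (not suite code):

Request_remaining = [["HEAD /", "GET /"]]   # collected from server 0
Request_List      = [["GET /", "HEAD /"]]   # expected of server 0
# FilesCrawled compares sets, so the differing order above still passes.
assert set (Request_remaining[0]).symmetric_difference (Request_List[0]) == set ()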