diff options
Diffstat (limited to 'modules/language/python/module/urllib/robotparser.py')
-rw-r--r-- | modules/language/python/module/urllib/robotparser.py | 25 |
1 files changed, 13 insertions, 12 deletions
diff --git a/modules/language/python/module/urllib/robotparser.py b/modules/language/python/module/urllib/robotparser.py index f110d80..cde8b47 100644 --- a/modules/language/python/module/urllib/robotparser.py +++ b/modules/language/python/module/urllib/robotparser.py @@ -1,4 +1,4 @@ -module(urllib.robotparser) +module(urllib,robotparser) """ robotparser.py @@ -13,8 +13,9 @@ module(urllib.robotparser) """ import collections -import urllib.parse -import urllib.request +import urllib.parse as uparse +import urllib.error as error +import urllib.request as request __all__ = ["RobotFileParser"] @@ -55,13 +56,13 @@ class RobotFileParser: def set_url(self, url): """Sets the URL referring to a robots.txt file.""" self.url = url - self.host, self.path = urllib.parse.urlparse(url)[1:3] + self.host, self.path = uparse.urlparse(url)[1:3] def read(self): """Reads the robots.txt URL and feeds it to the parser.""" try: - f = urllib.request.urlopen(self.url) - except urllib.error.HTTPError as err: + f = request.urlopen(self.url) + except error.HTTPError as err: if err.code in (401, 403): self.disallow_all = True elif err.code >= 400 and err.code < 500: @@ -112,7 +113,7 @@ class RobotFileParser: line = line.split(':', 1) if len(line) == 2: line[0] = line[0].strip().lower() - line[1] = urllib.parse.unquote(line[1].strip()) + line[1] = uparse.unquote(line[1].strip()) if line[0] == "user-agent": if state == 2: self._add_entry(entry) @@ -160,10 +161,10 @@ class RobotFileParser: return False # search for given user agent matches # the first match counts - parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url)) - url = urllib.parse.urlunparse(('','',parsed_url.path, + parsed_url = uparse.urlparse(uparse.unquote(url)) + url = uparse.urlunparse(('','',parsed_url.path, parsed_url.params,parsed_url.query, parsed_url.fragment)) - url = urllib.parse.quote(url) + url = uparse.quote(url) if not url: url = "/" for entry in self.entries: @@ -202,8 +203,8 @@ class RuleLine: if path == '' and not allowance: # an empty value means allow all allowance = True - path = urllib.parse.urlunparse(urllib.parse.urlparse(path)) - self.path = urllib.parse.quote(path) + path = uparse.urlunparse(uparse.urlparse(path)) + self.path = uparse.quote(path) self.allowance = allowance def applies_to(self, filename): |