~cpp #!/usr/bin/env python # Copyright (c) 2005, Google Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following disclaimer # in the documentation and/or other materials provided with the # distribution. # * Neither the name of Google Inc. nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import robotparser import urllib import urllib2 import re import sets import sys import urlparse import win32com.client import time import pywintypes import pythoncom import optparse import getpass import itertools import email.Utils '''A simple web crawler that pushes pages into GDS. Features include: - Knows basic and digest HTTP authentication - Obeys robots.txt - Can loop, recrawling over previously crawled pages every X minutes - When recrawling, uses If-Modified-Since HTTP header to minimize transfers For usage instructions, run with -h flag. Requires Python 2.4 and the win32all extensions for Python 2.4 on Windows. Will not work unless Google Desktop Search 1.0 or later is installed. ''' # Matches URLs in <a href=...> tags. Chosen above htmllib.HTMLParser because # this is much more lenient, not requiring HTML to be valid. _LINK_RE = re.compile(r'<\s*(a|img).+href\s*=\s*"?(.+?)"?(\s|>)', re.MULTILINE | re.IGNORECASE) # Matches <frame src="bla"> tags. _FRAME_RE = re.compile(r'<\s*(frame).+src\s*=\s*"?(.+?)"?(\s|>)', re.MULTILINE | re.IGNORECASE) # Digs out the text of an HTML document's title. _TITLE_RE = re.compile(r'<\s*title.*?>(.+)</\s*title\s*>', re.MULTILINE | re.IGNORECASE) # This plugin's GUID, used to register with GDS. _GUID = '{5e1788fe-a6e6-429f-816c-80cb969028d3}' class NoExceptionHandler(urllib2.BaseHandler): '''An exception handler for HTTP that never throws an exception for various error codes that Kongulo always checks explicitly rather than catching them as exceptions.''' def http_error_304(self, req, fp, code, msg, hdrs): '''We handle not-modified-since explicitly.''' return fp # We check error codes explicitly so we don't want an exception http_error_400 = http_error_401 = http_error_402 = http_error_403 \ = http_error_404 = http_error_304 class PasswordDb(urllib2.HTTPPasswordMgr): '''A very simple password store. The user can supply usernames using the -p flag on the command line, and will be prompted for the password for each username.''' def __init__(self): self.passwords = [] # [ [substring, uid, pw], [substring, uid, pw] ] def Populate(self, options): '''Given an options object as used by Kongulo, ask the user for the password for each user-id/substring-of-domain that the user provided using the -p flag.''' if not options.pw: return for item in options.pw.split(','): (uid, substring) = item.split('@') pw = getpass.getpass('Enter password for %s: ' % item) self.passwords.append([substring, uid, pw]) def find_user_password(self, *args, **kw): for passdata in self.passwords: for name in args: if name.find(passdata[0]) != -1: return (passdata[1], passdata[2]) print "!!! Need login info for (%s @ %s), consider using -p flag" % args return (None, None) passwords = PasswordDb() # A URL opener that can do basic and digest authentication, and never raises # exceptions for HTTP error codes we handle explicitly. opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(passwords), urllib2.HTTPDigestAuthHandler(passwords), NoExceptionHandler()) # To be a nice Internet citizen, we identify ourselves properly so that # whoever doesn't like Kongulo can exclude us using robots.txt opener.addheaders = [('User-agent', 'Kongulo v0.1 personal web crawler')] # Should always be true on Windows systems. assert hasattr(opener.handlers[0], 'proxies'), 'ProxyHandler must be first handler.' # This parses Windows proxy registry settings opener.handlers[0].proxies = urllib.getproxies() class LenientRobotParser(robotparser.RobotFileParser): '''Adds ability to parse robot files where same user agent is specified multiple times.''' def __init__(self, url): '''Setup internal state like RobotFileParser does.''' robotparser.RobotFileParser.__init__(self) f = opener.open(url) lines = [] line = f.readline() while line: lines.append(line.strip()) line = f.readline() self.errcode = f.code if self.errcode == 401 or self.errcode == 403: self.disallow_all = 1 elif self.errcode >= 400: self.allow_all = 1 elif self.errcode == 200 and lines: self.parse(lines) def parse(self, lines): """Strip repeated sequential definitions of same user agent, then call base's parse method.""" last_ua = '' modified_lines = [] for line in lines: line if line.lower().startswith('user-agent'): temp = last_ua last_ua = line.lower() if last_ua == temp: continue # skip line if line.strip() == '': last_ua = '' # reset on blank line modified_lines += [line] robotparser.RobotFileParser.parse(self, modified_lines) class UrlValidator: '''An object that handles checking if we should fetch and crawl a specific URL. This is based on the type of the URL (only crawl http URLs) and robot rules. Maintains a cache of robot rules already fetched.''' def __init__(self, match_url): self.robots = {} # Dict of robot URLs to robot parsers self.match_url = re.compile(match_url) def IsCrawlable(self, url): """Returns true if it's OK to crawl the absolute URL provided.""" if not url.startswith('http') or not self.match_url.match(url): return 0 return self.GetRules(url).can_fetch('*', url) def GetRules(self, url): """Returns the robot rules parser for 'url'""" robots_dir = urlparse.urljoin(url, "robots.txt") # First try dir-level if robots_dir in self.robots: return self.robots[robots_dir] robots_site = urlparse.urljoin(url, "/robots.txt") # Then the site-level if robots_site in self.robots: return self.robots[robots_site] # Inv: Our cache contains neither a dir-level nor site-level robots.txt file rules = LenientRobotParser(robots_dir) # First try dir-level if hasattr(rules, 'errcode') and rules.errcode == 200: self.robots[robots_dir] = rules else: rules = LenientRobotParser(robots_site) # Then try site-level self.robots[robots_site] = rules return rules class Crawler: '''This object holds the state of the crawl, and performs the crawl.''' def __init__(self, options): self.options = options # Store the options provided self.rules = UrlValidator(options.match) # Cache of robot rules etc. # Invariant of data: # - 'tocrawl' is a list of items that we have or will crawl. If we have # never crawled them since we started, the item at index 2 in each # crawlitem is None, otherwise it is a dictionary of headers, # specifically the 'If-Modified-Since' header, to prevent us from fetching # this item in the next crawl if it hasn't been modified. # - 'scheduled' is a list of items we have already added to 'tocrawl' # (perhaps a premature optimization since we could just iterate over # 'tocrawl') self.scheduled = sets.Set() # Format of this list is: # [[url1, depth1, { headername : headerval, ... } ], [url2, depth2], {}...] self.tocrawl = [] # Fetch the entrypoint to the Google Desktop Search API. def ExtractLinks(self, baseurl, htmldoc): """Returns all anchors from the document with contents 'htmldoc' at 'baseurl' that are OK to crawl.""" urls = [] for match in itertools.chain(_LINK_RE.finditer(htmldoc), _FRAME_RE.finditer(htmldoc)): url = urlparse.urljoin(baseurl, match.group(2)) if self.rules.IsCrawlable(url): urls += [url] else: print " I %s" % url return urls def Crawl(self, baseurls): '''Performs the crawl. Args: baseurls: [url1, url2, ...] ''' # Bootstrap our invariant of data for baseurl in baseurls: self.tocrawl.append([baseurl, self.options.depth, None]) if self.options.loop: print "Running in loop mode - press Ctrl-C to stop." while True: for crawlitem in self.tocrawl: (url, depth, headers) = crawlitem try: if headers: doc = opener.open(urllib2.Request(url, headers=headers)) else: doc = opener.open(url) doctype = doc.info().type if doc.code == 304: # not modified since last time print "--- (nomod) %s" % url elif (doc.code == 200 and doctype == 'text/html' or doctype == 'text/plain'): print "::: (%d) %s" % (depth, url) # Store last modified in the crawlitem # Prefer Last-Modified header, then Date header (to get same # formatting as used by the server), then current date in # appropriate format. last_modified = None if 'last_modified' in doc.headers: last_modified = fdoc.headers['last_modified'] elif 'date' in doc.headers: last_modified = doc.headers['date'] else: last_modified = email.Utils.formatdate(time.time(), usegmt=True) crawlitem[2] = { 'If-Modified-Since' : last_modified } content = doc.read() # Create a GDS event, populate its fields, and send it off to have # the web page added to the Google Desktop Search index. #event.AddProperty('format', doctype) #event.AddProperty('content', content) #event.AddProperty('uri', url) fout = file("output.txt",'w'); print >> fout,"\n****doctype********\n" print >> fout, doctype print >> fout,"\n****content********\n" print >> fout, content print >> fout,"\n******url******\n" print >> fout, url # TODO Use the last-modified HTTP header instead of current time # if available. #event.AddProperty('last_modified_time', # pywintypes.Time(time.time() + time.timezone)) print >> fout,"\n*****time*********\n" print >> fout, pywintypes.Time(time.time() + time.timezone) if doctype == 'text/html': # no links in text documents title_match = _TITLE_RE.search(content) if title_match: title = title_match.group(1) #event.AddProperty('title', title) print >> fout, "\n*****title*******\n" print >> fout, title for link in self.ExtractLinks(doc.geturl(), content): if depth > 0 and not link in self.scheduled: self.scheduled.add(link) self.tocrawl.append([link, depth - 1, None]) # Don't use historical flag, because if we do, GDS will "throttle" # the events we send, not returning until the user becomes idle. # We also want to ensure the page is updated in the cache (in case # the user already visited it herself using a browser). #event.Send(0x01) else: print "!!! (HTTP %d) %s" % (doc.code, url) doc.close() except IOError: print "!!! (nolink) %s" % url except ValueError: print "!!! (noauth) %s" % url if not self.options.loop: break else: print ("=== Completed crawl; will recrawl in %d minutes." % (self.options.sleep)) time.sleep(60 * self.options.sleep) def Main(): '''This function contains the logic for the command-line UI for Kongulo.''' # Set up options and parse arguments. parser = optparse.OptionParser(usage='%prog [options] BASEURL1 BASEURL2 ...') parser.add_option('-d', '--depth', type='int', dest='depth', default=0, help='How deep to follow links from BASEURLs (default 0, ' 'suggest max 5-6)') parser.add_option('-m', '--match', dest='match', default='.+', help=r'Regular expression that URLs must match if they are ' 'to be crawled, e.g. ".+intranet\.smurfgeburf\.com.+" to ' 'stay within the Smurfgeburf intranet') parser.add_option('-l', '--loop', action='store_true', dest='loop', default=False, help='If this flag is given, Kongulo will ' 'keep fetching the specified page and pages it points to. ' 'It will not refetch pages that haven't changed.') parser.add_option('-s', '--sleep', type='int', dest='sleep', default=60, help='Number of minutes to sleep before looping (default ' '60). Only valid if -l is also specified.') parser.add_option('-p', '--passwords', dest='pw', help='Comma-delimited list of user IDs at names that will ' 'be matched as substrings against the domain or "region" ' 'that a password is needed for, e.g. ' '"joi@google.com,admin@192.168.250.1,snafu@slashdot.org". ' 'You will be prompted for each password.') (options, args) = parser.parse_args() if len(args) < 1: parser.error('Provide at least one base URL') # try: # obj = win32com.client.Dispatch('GoogleDesktopSearch.Register') # except pythoncom.ole_error: # print ('ERROR: You need to install Google Desktop Search to be able to ' # 'use Kongulo.') # sys.exit(2) # try: # Register with GDS. This is a one-time operation and will return an # error if already registered. We cheat and just catch the error and # do nothing. # obj.RegisterComponent(_GUID, # ['Title', 'Kongulo', 'Description', 'A simple web spider that ' # 'lets you keep copies of web sites in your Google Desktop Search ' # 'index.', 'Icon', '%SystemRoot%\system32\SHELL32.dll,134']) # TODO Provide an unregistration mechanism. # except: # TODO narrow to only the error that GDS returns when component # already registered # pass passwords.Populate(options) Crawler(options).Crawl(args) if __name__ == '__main__': Main()