
Kongulo

~python
#!/usr/bin/env python

# Copyright (c) 2005, Google Inc.
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import robotparser
import urllib
import urllib2
import re
import sets
import sys
import urlparse
import win32com.client
import time
import pywintypes
import pythoncom
import optparse
import getpass
import itertools
import email.Utils

'''A simple web crawler that pushes pages into GDS.  Features include:
  - Knows basic and digest HTTP authentication
  - Obeys robots.txt
  - Can loop, recrawling over previously crawled pages every X minutes
  - When recrawling, uses If-Modified-Since HTTP header to minimize transfers

For usage instructions, run with -h flag.

Requires Python 2.4 and the win32all extensions for Python 2.4 on Windows.
Will not work unless Google Desktop Search 1.0 or later is installed.
'''
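
# Example invocation (illustrative only; the script name, host and option
# values below are made up):
#
#   python kongulo.py -d 2 -m ".+intranet\.example\.com.+" -l -s 30 \
#       -p admin@intranet.example.com http://intranet.example.com/
#
# This crawls http://intranet.example.com/ two levels deep, only follows URLs
# matching the -m pattern, prompts once for the 'admin' password, and then
# recrawls every 30 minutes because of -l/-s.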

# Matches URLs in <a href=...> tags.  Chosen over htmllib.HTMLParser because
# this is much more lenient, not requiring HTML to be valid.
_LINK_RE = re.compile(r'<\s*(a|img).+href\s*=\s*"?(.+?)"?(\s|>)',
                      re.MULTILINE | re.IGNORECASE)
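
# For instance (illustrative fragment), _LINK_RE.finditer() applied to
#   '<a class="nav" href="http://example.com/page">'
# yields a match whose group(2) is 'http://example.com/page', even if the
# surrounding document would not pass a strict HTML parser.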


# Matches <frame src="bla"> tags.
_FRAME_RE = re.compile(r'<\s*(frame).+src\s*=\s*"?(.+?)"?(\s|>)',
                       re.MULTILINE | re.IGNORECASE)


# Digs out the text of an HTML document's title.
_TITLE_RE = re.compile(r'<\s*title.*?>(.+)</\s*title\s*>',
                       re.MULTILINE | re.IGNORECASE)


# This plugin's GUID, used to register with GDS.
_GUID = '{5e1788fe-a6e6-429f-816c-80cb969028d3}'


class NoExceptionHandler(urllib2.BaseHandler):
  '''An exception handler for HTTP that never throws an exception for various
  error codes that Kongulo always checks explicitly rather than catching them
  as exceptions.'''
  def http_error_304(self, req, fp, code, msg, hdrs):
    '''We handle not-modified-since explicitly.'''
    return fp
  
  # We check error codes explicitly so we don't want an exception
  http_error_400 = http_error_401 = http_error_402 = http_error_403 \
  = http_error_404 = http_error_304


class PasswordDb(urllib2.HTTPPasswordMgr):
  '''A very simple password store.  The user can supply usernames using the
  -p flag on the command line, and will be prompted for the password for
  each username.'''
  
  def __init__(self):
    self.passwords = []  # [ [substring, uid, pw], [substring, uid, pw] ]
  
  def Populate(self, options):
    '''Given an options object as used by Kongulo, ask the user for the
    password for each user-id/substring-of-domain that the user provided using
    the -p flag.'''
    if not options.pw:
      return
    
    for item in options.pw.split(','):
      (uid, substring) = item.split('@')
      pw = getpass.getpass('Enter password for %s: ' % item)
      self.passwords.append([substring, uid, pw])
    
  def find_user_password(self, *args, **kw):
    for passdata in self.passwords:
      for name in args:
        if name.find(passdata[0]) != -1:
          return (passdata[1], passdata[2])
    print "!!! Need login info for (%s @ %s), consider using -p flag" % args
    return (None, None)
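
  # Illustrative example (hypothetical values): after running with
  # -p "joi@intranet.example.com", a call like
  # find_user_password('Some Realm', 'http://intranet.example.com/wiki')
  # returns ('joi', <password entered at the prompt>) because the second
  # argument contains the stored substring 'intranet.example.com'.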

passwords = PasswordDb()

# A URL opener that can do basic and digest authentication, and never raises
# exceptions for HTTP error codes we handle explicitly.
opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(passwords),
                              urllib2.HTTPDigestAuthHandler(passwords),
                              NoExceptionHandler())

# To be a nice Internet citizen, we identify ourselves properly so that
# whoever doesn't like Kongulo can exclude us using robots.txt
opener.addheaders = [('User-agent', 'Kongulo v0.1 personal web crawler')]

# Should always be true on Windows systems.
assert hasattr(opener.handlers[0],
               'proxies'), 'ProxyHandler must be first handler.'
# This parses Windows proxy registry settings
opener.handlers[0].proxies = urllib.getproxies()

class LenientRobotParser(robotparser.RobotFileParser):
  '''Adds ability to parse robot files where same user agent is specified
  multiple times.'''

  def __init__(self, url):
    '''Setup internal state like RobotFileParser does.'''
    robotparser.RobotFileParser.__init__(self)
    f = opener.open(url)
    lines = []
    line = f.readline()
    while line:
      lines.append(line.strip())
      line = f.readline()
    self.errcode = f.code
    if self.errcode == 401 or self.errcode == 403:
      self.disallow_all = 1
    elif self.errcode >= 400:
      self.allow_all = 1
    elif self.errcode == 200 and lines:
      self.parse(lines)
    
  def parse(self, lines):
    """Strip repeated sequential definitions of same user agent, then
    call base's parse method."""
    last_ua = ''
    modified_lines = []
    for line in lines:
      if line.lower().startswith('user-agent'):
        temp = last_ua
        last_ua = line.lower()
        if last_ua == temp:
          continue  # skip line
      if line.strip() == '':
        last_ua = ''  # reset on blank line
      modified_lines += [line]
    
    robotparser.RobotFileParser.parse(self, modified_lines)
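
  # For example (hypothetical robots.txt lines), parse() reduces
  #   ['User-agent: *', 'Disallow: /cgi-bin/', 'User-agent: *',
  #    'Disallow: /tmp/']
  # to
  #   ['User-agent: *', 'Disallow: /cgi-bin/', 'Disallow: /tmp/']
  # so the standard RobotFileParser sees a single rule group for '*' instead
  # of two.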


class UrlValidator:
  '''An object that handles checking if we should fetch and crawl a specific
  URL.  This is based on the type of the URL (only crawl http URLs) and robot
  rules.  Maintains a cache of robot rules already fetched.'''
  
  def __init__(self, match_url):
    self.robots = {}  # Dict of robot URLs to robot parsers
    self.match_url = re.compile(match_url)
  
  def IsCrawlable(self, url):
    """Returns true if it's OK to crawl the absolute URL provided."""
    if not url.startswith('http') or not self.match_url.match(url):
      return 0
    return self.GetRules(url).can_fetch('*', url)
  
  def GetRules(self, url):
    """Returns the robot rules parser for 'url'"""
    robots_dir = urlparse.urljoin(url, "robots.txt")  # First try dir-level
    if robots_dir in self.robots:
      return self.robots[robots_dir]
    robots_site = urlparse.urljoin(url, "/robots.txt")  # Then the site-level
    if robots_site in self.robots:
      return self.robots[robots_site]
    
    # Inv: Our cache contains neither a dir-level nor site-level robots.txt file
    
    rules = LenientRobotParser(robots_dir)  # First try dir-level
    if hasattr(rules, 'errcode') and rules.errcode == 200:
      self.robots[robots_dir] = rules
    else:
      rules = LenientRobotParser(robots_site)  # Then try site-level
      self.robots[robots_site] = rules
    
    return rules
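
  # Illustrative lookup (hypothetical URLs): for
  # 'http://example.com/docs/page.html' this first tries
  # 'http://example.com/docs/robots.txt', and if that does not come back as
  # HTTP 200 it falls back to 'http://example.com/robots.txt'.  Whichever
  # parser is built gets cached, so later URLs on the same site reuse it
  # instead of refetching robots.txt.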


class Crawler:
  '''This object holds the state of the crawl, and performs the crawl.'''
  
  def __init__(self, options):    
    self.options = options  # Store the options provided
    self.rules = UrlValidator(options.match)  # Cache of robot rules etc.
    
    # Invariant of data:
    # - 'tocrawl' is a list of items that we have or will crawl.  If we have
    #   never crawled them since we started, the item at index 2 in each
    #   crawlitem is None, otherwise it is a dictionary of headers,
    #   specifically the 'If-Modified-Since' header, to prevent us from fetching
    #   this item in the next crawl if it hasn't been modified.
    # - 'scheduled' is a list of items we have already added to 'tocrawl'
    #   (perhaps a premature optimization since we could just iterate over
    #   'tocrawl')
    self.scheduled = sets.Set()
    # Format of this list is:
    # [[url1, depth1, {headername: headerval, ...}], [url2, depth2, None], ...]
    self.tocrawl = []
    
    # In the original Kongulo this is where the Google Desktop Search event
    # factory would be fetched via win32com; in this listing the GDS calls are
    # commented out and crawled pages are dumped to output.txt in Crawl()
    # instead.

  
  def ExtractLinks(self, baseurl, htmldoc):
    """Returns all anchors from the document with contents 'htmldoc' at
    'baseurl' that are OK to crawl."""
    urls = []
    for match in itertools.chain(_LINK_RE.finditer(htmldoc),
                                 _FRAME_RE.finditer(htmldoc)):
      url = urlparse.urljoin(baseurl, match.group(2))
      if self.rules.IsCrawlable(url):
        urls += [url]
      else:
        print "    I %s" % url
    return urls
  
  def Crawl(self, baseurls):
    '''Performs the crawl.
    
    Args:
      baseurls: [url1, url2, ...]
    '''
    # Bootstrap our invariant of data
    for baseurl in baseurls:
      self.tocrawl.append([baseurl, self.options.depth, None])
    
    if self.options.loop:
      print "Running in loop mode - press Ctrl-C to stop."
    
    while True:
      for crawlitem in self.tocrawl:
        (url, depth, headers) = crawlitem
        try:
          if headers:
            doc = opener.open(urllib2.Request(url, headers=headers))
          else:
            doc = opener.open(url)
          
          doctype = doc.info().type
          if doc.code == 304:  # not modified since last time
            print "--- (nomod) %s" % url
          elif (doc.code == 200 and
                (doctype == 'text/html' or doctype == 'text/plain')):
            print "::: (%d) %s" % (depth, url)
            
            # Store last modified in the crawlitem
            # Prefer Last-Modified header, then Date header (to get same
            # formatting as used by the server), then current date in
            # appropriate format.
            last_modified = None
            if 'last-modified' in doc.headers:
              last_modified = doc.headers['last-modified']
            elif 'date' in doc.headers:
              last_modified = doc.headers['date']
            else:
              last_modified = email.Utils.formatdate(time.time(), usegmt=True)
            crawlitem[2] = { 'If-Modified-Since' : last_modified }
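            # e.g. crawlitem[2] might now be (hypothetical value)
            # {'If-Modified-Since': 'Mon, 07 Feb 2005 09:15:00 GMT'}, which is
            # sent as a request header on the next pass through the loop.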
            
            content = doc.read()
            
            # Create a GDS event, populate its fields, and send it off to have
            # the web page added to the Google Desktop Search index.
            
            #event.AddProperty('format', doctype)
            #event.AddProperty('content', content)
            #event.AddProperty('uri', url)
            fout = file('output.txt', 'w')
            print >> fout, "\n****doctype********\n"
            print >> fout, doctype
            print >> fout, "\n****content********\n"
            print >> fout, content
            print >> fout, "\n******url******\n"
            print >> fout, url

            # TODO Use the last-modified HTTP header instead of current time
            # if available.
            #event.AddProperty('last_modified_time',
            #                  pywintypes.Time(time.time() + time.timezone))
            print >> fout, "\n*****time*********\n"
            print >> fout, pywintypes.Time(time.time() + time.timezone)

            
            if doctype == 'text/html':  # no links in text documents
              title_match = _TITLE_RE.search(content)
              if title_match:
                title = title_match.group(1)
                #event.AddProperty('title', title)
                print >> fout, "\n*****title*******\n"
                print >> fout, title
              
              for link in self.ExtractLinks(doc.geturl(), content):
                if depth > 0 and not link in self.scheduled:
                  self.scheduled.add(link)
                  self.tocrawl.append([link, depth - 1, None])
  
            # Don't use historical flag, because if we do, GDS will "throttle"
            # the events we send, not returning until the user becomes idle.
            # We also want to ensure the page is updated in the cache (in case
            # the user already visited it herself using a browser).
            #event.Send(0x01)
            fout.close()
          else:
            print "!!! (HTTP %d) %s" % (doc.code, url)
    
          doc.close()
        except IOError:
          print "!!! (nolink) %s" % url
        except ValueError:
          print "!!! (noauth) %s" % url
      
      if not self.options.loop:
        break
      else:
        print ("=== Completed crawl; will recrawl in %d minutes." %
               (self.options.sleep))
        time.sleep(60 * self.options.sleep)


def Main():
  '''This function contains the logic for the command-line UI for Kongulo.'''
  
  # Set up options and parse arguments.
  parser = optparse.OptionParser(usage='%prog [options] BASEURL1 BASEURL2 ...')
  parser.add_option('-d', '--depth', type='int', dest='depth', default=0,
                    help='How deep to follow links from BASEURLs (default 0, '
                         'suggest max 5-6)')
  parser.add_option('-m', '--match', dest='match', default='.+',
                    help=r'Regular expression that URLs must match if they are '
                    'to be crawled, e.g. ".+intranet\.smurfgeburf\.com.+" to '
                    'stay within the Smurfgeburf intranet')
  parser.add_option('-l', '--loop', action='store_true', dest='loop',
                    default=False, help='If this flag is given, Kongulo will '
                    'keep fetching the specified page and pages it points to.  '
                    'It will not refetch pages that haven\'t changed.')
  parser.add_option('-s', '--sleep', type='int', dest='sleep', default=60,
                    help='Number of minutes to sleep before looping (default '
                    '60). Only valid if -l is also specified.')
  parser.add_option('-p', '--passwords', dest='pw',
                    help='Comma-delimited list of userid@name pairs; each '
                    'name is matched as a substring against the domain or '
                    '"region" that a password is needed for, e.g. '
                    '"joi@google.com,admin@192.168.250.1,snafu@slashdot.org".  '
                    'You will be prompted for each password.')
  (options, args) = parser.parse_args()
  if len(args) < 1:
    parser.error('Provide at least one base URL')
  
#  try:
#    obj = win32com.client.Dispatch('GoogleDesktopSearch.Register')
#  except pythoncom.ole_error:
#    print ('ERROR: You need to install Google Desktop Search to be able to '
#           'use Kongulo.')
#    sys.exit(2)

#  try:
#    # Register with GDS.  This is a one-time operation and will return an
#    # error if already registered.  We cheat and just catch the error and
#    # do nothing.
#    obj.RegisterComponent(_GUID,
#             ['Title', 'Kongulo', 'Description', 'A simple web spider that '
#              'lets you keep copies of web sites in your Google Desktop Search '
#              'index.', 'Icon', '%SystemRoot%\system32\SHELL32.dll,134'])

#    # TODO Provide an unregistration mechanism.
#  except:
#    # TODO narrow to only the error that GDS returns when component
#    # already registered
#    pass

  passwords.Populate(options)
  Crawler(options).Crawl(args)


if __name__ == '__main__':
  Main()