No older revisions available
No older revisions available
~cpp
#!/usr/bin/env python
# Copyright (c) 2005, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import robotparser
import urllib
import urllib2
import re
import sets
import sys
import urlparse
import win32com.client
import time
import pywintypes
import pythoncom
import optparse
import getpass
import itertools
import email.Utils
'''A simple web crawler that pushes pages into GDS. Features include:
- Knows basic and digest HTTP authentication
- Obeys robots.txt
- Can loop, recrawling over previously crawled pages every X minutes
- When recrawling, uses If-Modified-Since HTTP header to minimize transfers
For usage instructions, run with -h flag.
Requires Python 2.4 and the win32all extensions for Python 2.4 on Windows.
Will not work unless Google Desktop Search 1.0 or later is installed.
'''
# Matches the URL (group 2) of an href attribute in <a ...> and <img ...>
# tags. A regular expression was chosen above htmllib.HTMLParser because it
# is much more lenient, not requiring HTML to be valid.
# The filler between the tag name and 'href' is deliberately non-greedy: a
# greedy '.+' runs to the LAST 'href' on the line, so only one link per line
# would ever be reported.
_LINK_RE = re.compile(r'<\s*(a|img).+?href\s*=\s*"?(.+?)"?(\s|>)',
                      re.MULTILINE | re.IGNORECASE)
# Matches the URL (group 2) of <frame src="bla"> tags. Non-greedy filler for
# the same reason as above, so every frame on a line is found.
_FRAME_RE = re.compile(r'<\s*(frame).+?src\s*=\s*"?(.+?)"?(\s|>)',
                       re.MULTILINE | re.IGNORECASE)
# Digs out the text (group 1) of an HTML document's title. The capture is
# non-greedy so it stops at the first closing </title> on the line.
_TITLE_RE = re.compile(r'<\s*title.*?>(.+?)</\s*title\s*>',
                       re.MULTILINE | re.IGNORECASE)
# This plugin's GUID, used to register with GDS.
_GUID = '{5e1788fe-a6e6-429f-816c-80cb969028d3}'
class NoExceptionHandler(urllib2.BaseHandler):
'''An exception handler for HTTP that never throws an exception for various
error codes that Kongulo always checks explicitly rather than catching them
as exceptions.'''
def http_error_304(self, req, fp, code, msg, hdrs):
'''We handle not-modified-since explicitly.'''
return fp
# We check error codes explicitly so we don't want an exception
http_error_400 = http_error_401 = http_error_402 = http_error_403 \
= http_error_404 = http_error_304
class PasswordDb(urllib2.HTTPPasswordMgr):
'''A very simple password store. The user can supply usernames using the
-p flag on the command line, and will be prompted for the password for
each username.'''
def __init__(self):
self.passwords = [] # [ [substring, uid, pw], [substring, uid, pw] ]
def Populate(self, options):
'''Given an options object as used by Kongulo, ask the user for the
password for each user-id/substring-of-domain that the user provided using
the -p flag.'''
if not options.pw:
return
for item in options.pw.split(','):
(uid, substring) = item.split('@')
pw = getpass.getpass('Enter password for %s: ' % item)
self.passwords.append([substring, uid, pw])
def find_user_password(self, *args, **kw):
for passdata in self.passwords:
for name in args:
if name.find(passdata[0]) != -1:
return (passdata[1], passdata[2])
print "!!! Need login info for (%s @ %s), consider using -p flag" % args
return (None, None)
# Module-level password store. The same instance is handed to both
# authentication handlers below, and Main() fills it in from the -p flag by
# calling Populate().
passwords = PasswordDb()
# A URL opener that can do basic and digest authentication, and never raises
# exceptions for HTTP error codes we handle explicitly.
opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(passwords),
urllib2.HTTPDigestAuthHandler(passwords),
NoExceptionHandler())
# To be a nice Internet citizen, we identify ourselves properly so that
# whoever doesn't like Kongulo can exclude us using robots.txt
# (Assigning to addheaders replaces the opener's whole header list.)
opener.addheaders = [('User-agent', 'Kongulo v0.1 personal web crawler')]
# The proxy assignment further down relies on the first handler being the one
# that carries a 'proxies' attribute, so that assumption is checked first.
# Should always be true on Windows systems.
assert hasattr(opener.handlers[0],
'proxies'), 'ProxyHandler must be first handler.'
# This parses Windows proxy registry settings
opener.handlers[0].proxies = urllib.getproxies()
class LenientRobotParser(robotparser.RobotFileParser):
'''Adds ability to parse robot files where same user agent is specified
multiple times.'''
def __init__(self, url):
'''Setup internal state like RobotFileParser does.'''
robotparser.RobotFileParser.__init__(self)
f = opener.open(url)
lines = []
line = f.readline()
while line:
lines.append(line.strip())
line = f.readline()
self.errcode = f.code
if self.errcode == 401 or self.errcode == 403:
self.disallow_all = 1
elif self.errcode >= 400:
self.allow_all = 1
elif self.errcode == 200 and lines:
self.parse(lines)
def parse(self, lines):
"""Strip repeated sequential definitions of same user agent, then
call base's parse method."""
last_ua = ''
modified_lines = []
for line in lines:
line
if line.lower().startswith('user-agent'):
temp = last_ua
last_ua = line.lower()
if last_ua == temp:
continue # skip line
if line.strip() == '':
last_ua = '' # reset on blank line
modified_lines += [line]
robotparser.RobotFileParser.parse(self, modified_lines)
class UrlValidator:
'''An object that handles checking if we should fetch and crawl a specific
URL. This is based on the type of the URL (only crawl http URLs) and robot
rules. Maintains a cache of robot rules already fetched.'''
def __init__(self, match_url):
self.robots = {} # Dict of robot URLs to robot parsers
self.match_url = re.compile(match_url)
def IsCrawlable(self, url):
"""Returns true if it's OK to crawl the absolute URL provided."""
if not url.startswith('http') or not self.match_url.match(url):
return 0
return self.GetRules(url).can_fetch('*', url)
def GetRules(self, url):
"""Returns the robot rules parser for 'url'"""
robots_dir = urlparse.urljoin(url, "robots.txt") # First try dir-level
if robots_dir in self.robots:
return self.robots[robots_dir]
robots_site = urlparse.urljoin(url, "/robots.txt") # Then the site-level
if robots_site in self.robots:
return self.robots[robots_site]
# Inv: Our cache contains neither a dir-level nor site-level robots.txt file
rules = LenientRobotParser(robots_dir) # First try dir-level
if hasattr(rules, 'errcode') and rules.errcode == 200:
self.robots[robots_dir] = rules
else:
rules = LenientRobotParser(robots_site) # Then try site-level
self.robots[robots_site] = rules
return rules
class Crawler:
'''This object holds the state of the crawl, and performs the crawl.'''
def __init__(self, options):
self.options = options # Store the options provided
self.rules = UrlValidator(options.match) # Cache of robot rules etc.
# Invariant of data:
# - 'tocrawl' is a list of items that we have or will crawl. If we have
# never crawled them since we started, the item at index 2 in each
# crawlitem is None, otherwise it is a dictionary of headers,
# specifically the 'If-Modified-Since' header, to prevent us from fetching
# this item in the next crawl if it hasn't been modified.
# - 'scheduled' is a list of items we have already added to 'tocrawl'
# (perhaps a premature optimization since we could just iterate over
# 'tocrawl')
self.scheduled = sets.Set()
# Format of this list is:
# [[url1, depth1, { headername : headerval, ... } ], [url2, depth2], {}...]
self.tocrawl = []
# Fetch the entrypoint to the Google Desktop Search API.
def ExtractLinks(self, baseurl, htmldoc):
"""Returns all anchors from the document with contents 'htmldoc' at
'baseurl' that are OK to crawl."""
urls = []
for match in itertools.chain(_LINK_RE.finditer(htmldoc),
_FRAME_RE.finditer(htmldoc)):
url = urlparse.urljoin(baseurl, match.group(2))
if self.rules.IsCrawlable(url):
urls += [url]
else:
print " I %s" % url
return urls
def Crawl(self, baseurls):
'''Performs the crawl.
Args:
baseurls: [url1, url2, ...]
'''
# Bootstrap our invariant of data
for baseurl in baseurls:
self.tocrawl.append([baseurl, self.options.depth, None])
if self.options.loop:
print "Running in loop mode - press Ctrl-C to stop."
while True:
for crawlitem in self.tocrawl:
(url, depth, headers) = crawlitem
try:
if headers:
doc = opener.open(urllib2.Request(url, headers=headers))
else:
doc = opener.open(url)
doctype = doc.info().type
if doc.code == 304: # not modified since last time
print "--- (nomod) %s" % url
elif (doc.code == 200 and doctype == 'text/html' or
doctype == 'text/plain'):
print "::: (%d) %s" % (depth, url)
# Store last modified in the crawlitem
# Prefer Last-Modified header, then Date header (to get same
# formatting as used by the server), then current date in
# appropriate format.
last_modified = None
if 'last_modified' in doc.headers:
last_modified = fdoc.headers['last_modified']
elif 'date' in doc.headers:
last_modified = doc.headers['date']
else:
last_modified = email.Utils.formatdate(time.time(), usegmt=True)
crawlitem[2] = { 'If-Modified-Since' : last_modified }
content = doc.read()
# Create a GDS event, populate its fields, and send it off to have
# the web page added to the Google Desktop Search index.
#event.AddProperty('format', doctype)
#event.AddProperty('content', content)
#event.AddProperty('uri', url)
fout = file("output.txt",'w');
print >> fout,"\n****doctype********\n"
print >> fout, doctype
print >> fout,"\n****content********\n"
print >> fout, content
print >> fout,"\n******url******\n"
print >> fout, url
# TODO Use the last-modified HTTP header instead of current time
# if available.
#event.AddProperty('last_modified_time',
# pywintypes.Time(time.time() + time.timezone))
print >> fout,"\n*****time*********\n"
print >> fout, pywintypes.Time(time.time() + time.timezone)
if doctype == 'text/html': # no links in text documents
title_match = _TITLE_RE.search(content)
if title_match:
title = title_match.group(1)
#event.AddProperty('title', title)
print >> fout, "\n*****title*******\n"
print >> fout, title
for link in self.ExtractLinks(doc.geturl(), content):
if depth > 0 and not link in self.scheduled:
self.scheduled.add(link)
self.tocrawl.append([link, depth - 1, None])
# Don't use historical flag, because if we do, GDS will "throttle"
# the events we send, not returning until the user becomes idle.
# We also want to ensure the page is updated in the cache (in case
# the user already visited it herself using a browser).
#event.Send(0x01)
else:
print "!!! (HTTP %d) %s" % (doc.code, url)
doc.close()
except IOError:
print "!!! (nolink) %s" % url
except ValueError:
print "!!! (noauth) %s" % url
if not self.options.loop:
break
else:
print ("=== Completed crawl; will recrawl in %d minutes." %
(self.options.sleep))
time.sleep(60 * self.options.sleep)
def Main():
    '''This function contains the logic for the command-line UI for Kongulo.

    Parses the command line, prompts for any passwords requested with -p, and
    then crawls the base URLs given as positional arguments. Exits through
    parser.error() when no base URL is supplied.
    '''
    # Set up options and parse arguments.
    parser = optparse.OptionParser(
        usage='%prog [options] BASEURL1 BASEURL2 ...')
    parser.add_option('-d', '--depth', type='int', dest='depth', default=0,
                      help='How deep to follow links from BASEURLs (default 0, '
                      'suggest max 5-6)')
    parser.add_option('-m', '--match', dest='match', default='.+',
                      help='Regular expression that URLs must match if they are '
                      r'to be crawled, e.g. ".+intranet\.smurfgeburf\.com.+" to '
                      'stay within the Smurfgeburf intranet')
    parser.add_option('-l', '--loop', action='store_true', dest='loop',
                      default=False, help='If this flag is given, Kongulo will '
                      'keep fetching the specified page and pages it points to. '
                      "It will not refetch pages that haven't changed.")
    parser.add_option('-s', '--sleep', type='int', dest='sleep', default=60,
                      help='Number of minutes to sleep before looping (default '
                      '60). Only valid if -l is also specified.')
    parser.add_option('-p', '--passwords', dest='pw',
                      help='Comma-delimited list of user IDs at names that will '
                      'be matched as substrings against the domain or "region" '
                      'that a password is needed for, e.g. '
                      '"joi@google.com,admin@192.168.250.1,snafu@slashdot.org". '
                      'You will be prompted for each password.')
    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.error('Provide at least one base URL')

    # NOTE: registration with Google Desktop Search is currently disabled.
    # The original code is kept here for reference:
    #
    # try:
    #   obj = win32com.client.Dispatch('GoogleDesktopSearch.Register')
    # except pythoncom.ole_error:
    #   print ('ERROR: You need to install Google Desktop Search to be able to '
    #          'use Kongulo.')
    #   sys.exit(2)
    # try:
    #   # Register with GDS. This is a one-time operation and will return an
    #   # error if already registered. We cheat and just catch the error and
    #   # do nothing.
    #   obj.RegisterComponent(_GUID,
    #       ['Title', 'Kongulo', 'Description', 'A simple web spider that '
    #        'lets you keep copies of web sites in your Google Desktop Search '
    #        'index.', 'Icon', '%SystemRoot%\system32\SHELL32.dll,134'])
    #   # TODO Provide an unregistration mechanism.
    # except:
    #   # TODO narrow to only the error that GDS returns when component
    #   # already registered
    #   pass

    passwords.Populate(options)
    Crawler(options).Crawl(args)


# Run the command-line UI only when executed as a script, not when imported.
if __name__ == '__main__':
    Main()