#coding=cp936
#$Id: ReceiveURL.py,v 1.4 2004/03/27 13:09:48 liyinghui Exp $

import sys
from sgmllib import SGMLParser
from os.path import splitext

class ReceiveURL(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.url = []

    def start_a(self, attrs):
        for attr, value in attrs:
            if attr.lower() == "href":      # this is a link: strip any anchor and query string before processing
                href = value
                if href.find("#") > 0:
                    href = href[:href.index("#")]
                if href.find("?") > 0:
                    href = href[:href.index("?")]
                base, ext = splitext(href)
                if self.fileexts:
                    if ext:
                        if ext in self.fileexts:
                            # the extension is in the allowed list, so queue the link as the next file to process
                            self.addURL(href)
                        else:
                            self.logger.info('Skip extension [%s]' % ext)
                    else:
                        self.addURL(href)

    def start_link(self, attrs):
        for attr, value in attrs:
            if attr.lower() == "href":
                href = value
                self.addURL(href)

    def start_frame(self, attrs):
        for attr, value in attrs:
            if attr.lower() == "src":
                href = value
                self.addURL(href)

    def start_img(self, attrs):
        for attr, value in attrs:
            if attr.lower() == "src":
                href = value
                self.addURL(href)

    def addURL(self, url):
        if url not in self.url:
            self.url.append(url)

    def output(self):
        print 'total=', len(self.url)
        for u in self.url:
            print '[' + u + ']'

    def run(self, filename, fileexts, htmlexts, logger):
        self.fileexts = fileexts        # fileexts is the list of downloadable file extensions
        self.logger = logger
        fname, ext = splitext(filename)
        if ext in htmlexts:
            text = open(filename).read()
            self.feed(text)
            self.close()

    def getURLs(self):
        return self.url

if __name__ == '__main__':
    # standalone test: parse one local HTML file and print the links found in it
    # (the extension lists mirror the defaults used by crawl.py)
    import logging
    logging.basicConfig()
    r = ReceiveURL()
    r.run(sys.argv[1],
          ['.htm', '.html', '.gif', '.jpg', '.png', '.py', '.txt', '.css', '.js', '.aspx'],
          ['.htm', '.html'], logging.getLogger())
    r.output()
--------------------------------------
#!/usr/bin/env python
#coding=cp936
"""CRAWL V2.0

copyright (c) limodou (chatme at 263.net)

This is free software, distributed under the terms of the GPL.
You can use it to grab HTML documents beginning with a specified URL.
When you give a URL, it first grabs that page, parses all the links in
it, then grabs them all.

Enjoy it!

$Id: CRAWL.PY,v 1.2 2004/03/25 05:28:33 liyinghui Exp $
"""

from os import makedirs, unlink
from os.path import isdir, exists, dirname, splitext
from string import find, lower
from urlparse import urlparse, urljoin
import urllib2, getopt, sys, os, urllib
from ReceiveURL import ReceiveURL
import threading
import time
import traceback
import ConfigParser
import datetime

proxyflag = 0
seenfile = 'seen.txt'
downfile = 'down.txt'
inifile = '.crawl.ini'
logfile = 'crawl.log'
logger = None
tflag = False

class Retriever:                        # download Web pages
    def __init__(self, url, fileexts, htmlexts):
        self.url = url
        self.fileexts = fileexts
        self.htmlexts = htmlexts
        self.file = self.filename(url)
        self.r = ReceiveURL()

    def filename(self, url, deffile='index.html'):
        parsedurl = urlparse(url, 'http', 0)    # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path = path + deffile
            else:
                path = path + '/' + deffile
        dir = dirname(path)
        if not isdir(dir):                      # create archive dir if necessary
            if exists(dir):
                unlink(dir)
            try:
                makedirs(dir)
            except:
                pass
        # params or a query string get folded into the local file name
        flag = 0
        if parsedurl[3]:
            path += '_' + parsedurl[3]
            flag = 1
        if parsedurl[4]:
            path += '_' + parsedurl[4]
            flag = 1
        if flag:
            path += '.htm'
        return path

    def download(self):                 # download Web page
        try:
            # retval = urllib.urlretrieve(self.url, self.file)
            #add
            if proxyflag:
                f = urllib2.urlopen(self.url)
            else:
                f = urllib.urlopen(self.url)
            open(self.file, 'wb').write(f.read())
            retval = self.url, f.headers
            #add end
        except Exception, e:
            logger.error(str(e))
            retval = ('*** ERROR: invalid URL "%s"' % self.url, )
        return retval

    def parseAndGetLinks(self):         # parse HTML, save links
        self.r.run(self.file, self.fileexts, self.htmlexts, logger)
        return self.r.getURLs()

class Crawler:                          # manage entire crawling process
    count = 0                           # static downloaded page counter

    def __init__(self, url, seen, exts, htmlexts):
        # url is a list of URLs not yet downloaded, seen is a list of URLs already
        # downloaded, exts is the list of downloadable file extensions
        self.q = url[:]
        self.seen = seen[:]
        self.exts = exts
        self.htmlexts = htmlexts
        self.lock = threading.Lock()
        parse = urlparse(url[0])
        self.dom = parse[1]
        # start path; everything inside this directory will be grabbed
        self.basepath = parse[0] + '://' + parse[1] + dirname(parse[2])
        print 'Starting URL is: %s\n' % self.basepath

    def addDownLoadedURL(self, url):
        """Record a URL that has finished downloading."""
        self.lock.acquire()
        self.seen.append(url)
        Crawler.count = Crawler.count + 1
        open(seenfile, "w").write("\n".join(self.seen))
        self.lock.release()

    def getDownloadURL(self):
        self.lock.acquire()
        if len(self.q) > 0:
            url = self.q[0]
            self.q.remove(url)
            open(downfile, "w").write("\n".join(self.q))
        else:
            url = ''
        self.lock.release()
        return url

    def getPage(self, url):
        self.addDownLoadedURL(url)
        r = Retriever(url, self.exts, self.htmlexts)
        if url.startswith(self.basepath):
            print threading.currentThread().getName(), 'GETTING ~' + url[len(self.basepath):]
        else:
            print threading.currentThread().getName(), 'GETTING ' + url
        retval = r.download()
        if retval[0][0] == '*':         # error situation, do not parse
            print ' >>>> ERROR: skipping parse'
            return
        #print '\n(', Crawler.count, ')'
        #print 'URL:', url
        #print 'FILE:', retval[0]
        links = r.parseAndGetLinks()    # get and process links
        self.lock.acquire()
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink).split('#')[0]
                path = dirname(eachLink)
            else:
                path = dirname(eachLink)
            if find(lower(eachLink), 'mailto:') != -1:
                #print '... discarded, mailto link'
                continue
            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1 or not path.startswith(self.basepath):
                    # discarded: not in this domain, or outside the starting path
                    pass
                else:
                    if not eachLink in self.q:
                        self.q.append(eachLink)
                        #print '... new, added to Q'
                    else:
                        #print '... discarded, already in Q'
                        pass
            else:
                #print '... discarded, already processed'
                pass
        open(downfile, "w").write("\n".join(self.q))
        self.lock.release()

    def go(self, threadnum):            # process links in queue
        global tflag
        starttime = datetime.datetime.now()
        threads = []
        for i in range(threadnum):
            t = MyThread(self)
            t.setDaemon(True)
            threads.append(t)
        for i in range(threadnum):
            threads[i].start()
        while 1:
            try:
                # stop only when the queue is empty and no thread is still working
                if len(self.q) > 0:
                    time.sleep(0.1)
                    continue
                f = False
                for i in range(threadnum):
                    if threads[i].active:
                        f = True
                        break
                if f:
                    time.sleep(0.1)
                    continue
                break
            except:
                traceback.print_exc()
                break
        tflag = True
        endtime = datetime.datetime.now()
        print "Retrieved a total of %d files in %d seconds." % (Crawler.count, (endtime - starttime).seconds)

class MyThread(threading.Thread):
    def __init__(self, robot):
        self.active = False
        self.robot = robot
        threading.Thread.__init__(self)

    def run(self):
        while not tflag:
            url = self.robot.getDownloadURL()
            if url:
                self.active = True
                self.robot.getPage(url)
                self.active = False
            else:
                time.sleep(0.1)

def initlog():
    import logging
    global logger
    logger = logging.getLogger()
    hdlr = logging.FileHandler(logfile)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.NOTSET)
    return logger

def usage():
    print '''
CRAWL V2.0

copyright (c) limodou (chatme at 263.net)

This is free software, distributed under the terms of the GPL.
You can use it to grab HTML documents beginning with a specified URL.
When you give a URL, it first grabs that page, parses all the links in
it, then grabs them all. Enjoy it!

Command line usage:

python crawl.py [-p proxy|-t num|-f configfile] [url|-r]

    -p proxy        proxy = http://[username:password@]hostname:port
    -t num          number of worker threads (default 10)
    -f configfile   config file to use (default: .crawl.ini)
    -r              resume a previous crawl from down.txt and seen.txt

or

python crawl.py -u
'''

def main():
    global proxyflag
    global inifile
    try:
        opts, args = getopt.getopt(sys.argv[1:], "p:urt:f:", [])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    proxyhost = ''
    resume = 0
    seen = []
    threadnum = 10
    url = ''
    for o, a in opts:
        if o == '-u':
            usage()
            sys.exit()
        elif o == '-p':
            proxyhost = a
            proxyflag = 1
        elif o == '-r':
            url = [u.strip() for u in open(downfile).readlines()]
            seen = [u.strip() for u in open(seenfile).readlines() if u.strip()]
        elif o == '-t':
            try:
                threadnum = int(a)
            except:
                threadnum = 10
            if threadnum == 0:
                threadnum = 10
        elif o == '-f':
            inifile = a
    #args[0] = "http://localhost:8088/index.html"
    if len(args) > 0:
        url = [args[0]]
    elif not url:
        try:
            u = raw_input('Enter starting URL: ')
            url = [u]
        except (KeyboardInterrupt, EOFError):
            url = ''
    if proxyhost:
        # proxy=urllib2.ProxyHandler({'http':'http://www:www@11.133.232.19:8080'})
        print "\nProxy is: %s" % proxyhost
        proxy = urllib2.ProxyHandler({'http': proxyhost})
        opener = urllib2.build_opener(proxy)
        urllib2.install_opener(opener)
    if not url:
        return
    ini = ConfigParser.ConfigParser()
    ini.read(inifile)
    exts = []
    if ini.has_option('default', 'exts'):
        exts = ini.get('default', 'exts').split()
    if not exts:
        exts = ['.htm', '.html', '.gif', '.jpg', '.png', '.py', '.txt', '.css', '.js', '.aspx']
    htmlexts = []
    if ini.has_option('default', 'htmlexts'):
        htmlexts = ini.get('default', 'htmlexts').split()
    if not htmlexts:
        htmlexts = ['.htm', '.html']
    logger = initlog()
    robot = Crawler(url, seen, exts, htmlexts)
    robot.go(threadnum)

if __name__ == '__main__':
    main()
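
# Example invocations -- a sketch based on the usage() text above; the proxy
# host, credentials and starting URL are placeholders, and mysite.ini is a
# hypothetical config file name:
#
#   python crawl.py http://localhost:8088/index.html
#   python crawl.py -p http://username:password@proxyhost:8080 -t 5 http://localhost:8088/index.html
#   python crawl.py -r                              (resume from down.txt and seen.txt)
#   python crawl.py -f mysite.ini http://localhost:8088/index.html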
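--------------------------------------
; Example .crawl.ini -- a minimal sketch, not shipped with the scripts. The
; [default] section and the exts/htmlexts option names follow the ConfigParser
; calls in main(); the values below simply repeat the built-in defaults
; (each option is a whitespace-separated list of extensions).
[default]
exts = .htm .html .gif .jpg .png .py .txt .css .js .aspx
htmlexts = .htm .html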