Python: Web Crawler

By Xah Lee. Date: . Last updated: .

Here's a simple web crawler in Python. The Python 3 version is shown first, followed by the Python 2 version.

# Crawl a website and list every URL found under a specific given path.

# Starting page of the crawl; it is also the first entry recorded below.
inputURL = "http://ergoemacs.github.io/ergoemacs-mode/"

# Maps each URL we want to a flag: True means already crawled, False means
# still waiting to be crawled.
resultUrl = {}
resultUrl[inputURL] = False

# from urllib import urlopen
import urllib.request, urllib.error, urllib.parse
import urllib.parse
import time
import pprint

# get html links
from bs4 import BeautifulSoup
# import BeautifulSoup

def processOneUrl(url):
    """Fetch one page and record its in-scope links in resultUrl.

    Every ``<a href=...>`` on the page is resolved against ``url``. A link
    that starts with ``inputURL`` and is not yet a key of ``resultUrl`` is
    added with the value False (not crawled yet). ``url`` itself is always
    marked True (crawled) afterwards, even if fetching or parsing failed,
    so the caller never retries the same page.

    Args:
        url: absolute URL of the page to fetch.
    """
    try:
        # The ``with`` block closes the HTTP response once the page is parsed.
        with urllib.request.urlopen(url) as html_page:
            # ``BeautifulSoup`` is the class imported from bs4, so it is
            # called directly; naming the parser keeps results consistent.
            soup = BeautifulSoup(html_page, "html.parser")
        # href=True skips anchors that have no href attribute at all.
        for link in soup.find_all('a', href=True):
            fullurl = urllib.parse.urljoin(url, link['href'])
            # Drop any "#fragment" so one page is not queued several times.
            fullurl = urllib.parse.urldefrag(fullurl)[0]
            if fullurl.startswith(inputURL) and fullurl not in resultUrl:
                resultUrl[fullurl] = False
    except Exception as err:
        # Best effort: a failing page (e.g. 404) must not stop the crawl, but
        # report it instead of hiding it. KeyboardInterrupt still propagates.
        print("failed to process {}: {!r}".format(url, err))
    resultUrl[url] = True  # set as crawled

def moreToCrawl():
    """Return the first URL in resultUrl not yet crawled, or False if none.

    The URL that is found is also printed.
    """
    pending = next(
        (page for page, done in resultUrl.items() if not done),
        None,
    )
    if pending is None:
        return False
    print("moreToCrawl found {}".format(pending))
    return pending

# Keep processing until moreToCrawl() reports that nothing is left.
toCrawl = moreToCrawl()
while toCrawl:
    processOneUrl(toCrawl)
    time.sleep(2)  # pause between requests
    toCrawl = moreToCrawl()

# Show every collected URL together with its crawled flag.
pprint.pprint(resultUrl)
# -*- coding: utf-8 -*-
# python 2

# Crawl a website and list every URL found under a specific given path.

# Starting page of the crawl; it is also the first entry recorded below.
inputURL = "http://ergoemacs.github.io/ergoemacs-mode/"

# Maps each URL we want to a flag: True means already crawled, False means
# still waiting to be crawled.
resultUrl = {}
resultUrl[inputURL] = False

# from urllib import urlopen
import urllib2
import urlparse
import time
import pprint

import BeautifulSoup # get html links

def processOneUrl(url):
    """Fetch one page and record its in-scope links in resultUrl.

    Every ``<a href=...>`` on the page is resolved against ``url``. A link
    that starts with ``inputURL`` and is not yet a key of ``resultUrl`` is
    added with the value False (not crawled yet). ``url`` itself is always
    marked True (crawled) afterwards, even if fetching or parsing failed,
    so the caller never retries the same page.

    Args:
        url: absolute URL of the page to fetch.
    """
    try:
        html_page = urllib2.urlopen(url)
        try:
            soup = BeautifulSoup.BeautifulSoup(html_page)
        finally:
            # Release the HTTP response whether or not parsing succeeded.
            html_page.close()
        for link in soup.findAll('a'):
            href = link.get('href')
            if href is None:
                # Anchor without an href attribute: nothing to follow.
                continue
            fullurl = urlparse.urljoin(url, href)
            # Drop any "#fragment" so one page is not queued several times.
            fullurl = urlparse.urldefrag(fullurl)[0]
            if fullurl.startswith(inputURL) and fullurl not in resultUrl:
                resultUrl[fullurl] = False
    except Exception as err:
        # Best effort: a failing page (e.g. 404) must not stop the crawl, but
        # report it instead of hiding it. KeyboardInterrupt still propagates.
        print("failed to process {}: {!r}".format(url, err))
    resultUrl[url] = True  # set as crawled

def moreToCrawl():
    """Return the first URL in resultUrl not yet crawled, or False if none.

    The URL that is found is also printed.
    """
    pending = next(
        (page for page, done in resultUrl.iteritems() if not done),
        None,
    )
    if pending is None:
        return False
    print("moreToCrawl found {}".format(pending))
    return pending

# Keep processing until moreToCrawl() reports that nothing is left.
toCrawl = moreToCrawl()
while toCrawl:
    processOneUrl(toCrawl)
    time.sleep(2)  # pause between requests
    toCrawl = moreToCrawl()

# Show every collected URL together with its crawled flag.
pprint.pprint(resultUrl)

Sample output (from the Python 2 version, hence the u'...' string prefixes):

◆ python xx-testscript-4925873.py
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/design-basis.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/testimonials.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/archives.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/key-setup.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/bug-report.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/faq.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/changelog.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/customize-keys.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/minor-modes.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/features.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/smart-commands.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/aliases.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/tags/index.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/standard-shortcuts.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/gradual-adoption.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/index.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/key-themes.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/tags/design-basis.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/keyboard-layouts.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/cua-conflict.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/roadmap.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/banish-key-chords.html
moreToCrawl found http://ergoemacs.github.io/ergoemacs-mode/system-wide.html
{'http://ergoemacs.github.io/ergoemacs-mode/': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/aliases.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/archives.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/banish-key-chords.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/bug-report.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/changelog.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/cua-conflict.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/customize-keys.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/design-basis.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/faq.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/features.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/gradual-adoption.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/index.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/key-setup.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/key-themes.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/keyboard-layouts.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/minor-modes.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/roadmap.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/smart-commands.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/standard-shortcuts.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/system-wide.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/tags/design-basis.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/tags/index.html': True,
 u'http://ergoemacs.github.io/ergoemacs-mode/testimonials.html': True}

2014-01-25: Thanks to Sorawee Porncharoenwase [ https://plus.google.com/+SoraweePorncharoenwase/posts ] for the improvements.

discuss on Google Plus https://plus.google.com/+XahLee/posts/63YyJ79TypK