Web Scraping and Parsing in Python with urllib3 and BeautifulSoup4

It has been a while since I last used Python for web scraping and parsing. This time a client specifically asked for Python, and it turned out that urllib2, which the old script depended on, is no longer even available to download; the only usable option is urllib3. Python is as convenient as ever, though: all the dependencies were sorted out quickly, and the code is also very simple.

First, install the required dependencies by running the following commands (py is the Windows Python launcher; the bs4 package is a thin wrapper that simply pulls in beautifulsoup4).

 
py -m pip install bs4
py -m pip install certifi
py -m pip install urllib3
py -m pip install simplejson
 

Import statements:

 
import pprint
import urllib3, certifi
import simplejson
import sys
import subprocess
import json
import re
from bs4 import BeautifulSoup as Soup
from datetime import datetime
 
pp = pprint.PrettyPrinter(indent=4)
 

Parsing the page uses soupselect, which lets you select DOM elements with jQuery-style CSS selectors. It is really just a handful of functions, so you can copy them straight into the script:

 
tag_re = re.compile('^[a-z0-9]+$')
 
attribselect_re = re.compile(
    r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + 
    r'=?"?(?P<value>[^\]"]*)"?\]$'
)
 
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---/  \---/\-------------/    \-------/
#     |      |         |               |
#     |      |         |           The value
#     |      |    ~,|,^,$,* or =
#     |   Attribute 
#    Tag
 
def attribute_checker(operator, attribute, value=''):
    """
    Takes an operator, attribute and optional value; returns a function that
    will return True for elements that match that combination.
    """
    return {
        '=': lambda el: el.get(attribute) == value,
        # attribute includes value as one of a set of space separated tokens
        '~': lambda el: value in el.get(attribute, '').split(),
        # attribute starts with value
        '^': lambda el: el.get(attribute, '').startswith(value),
        # attribute ends with value
        '$': lambda el: el.get(attribute, '').endswith(value),
        # attribute contains value
        '*': lambda el: value in el.get(attribute, ''),
        # attribute is either exactly value or starts with value-
        '|': lambda el: el.get(attribute, '') == value \
            or el.get(attribute, '').startswith('%s-' % value),
    }.get(operator, lambda el: el.has_attr(attribute))  # default: attribute exists (bs4 removed has_key)
 
 
def select(soup, selector):
    """
    soup should be a BeautifulSoup instance; selector is a CSS selector 
    specifying the elements you want to retrieve.
    """
    tokens = selector.split()
    current_context = [soup]
    for token in tokens:
        m = attribselect_re.match(token)
        if m:
            # Attribute selector
            tag, attribute, operator, value = m.groups()
            if not tag:
                tag = True
            checker = attribute_checker(operator, attribute, value)
            found = []
            for context in current_context:
                found.extend([el for el in context.findAll(tag) if checker(el)])
            current_context = found
            continue
        if '#' in token:
            # ID selector
            tag, id = token.split('#', 1)
            if not tag:
                tag = True
            el = current_context[0].find(tag, {'id': id})
            if not el:
                return [] # No match
            current_context = [el]
            continue
        if '.' in token:
            # Class selector
            tag, klass = token.split('.', 1)
            if not tag:
                tag = True
            found = []
            for context in current_context:
                found.extend(
                    context.findAll(tag,
                        {'class': lambda attr: attr and klass in attr.split()}
                    )
                )
            current_context = found
            continue
        if token == '*':
            # Star selector
            found = []
            for context in current_context:
                found.extend(context.findAll(True))
            current_context = found
            continue
        # Here we should just have a regular tag
        if not tag_re.match(token):
            return []
        found = []
        for context in current_context:
            found.extend(context.findAll(token))
        current_context = found
    return current_context
 
def monkeypatch(BeautifulSoupClass=None):
    """
    If you don't explicitly state the class to patch, defaults to the most 
    common import location for BeautifulSoup.
    """
    if not BeautifulSoupClass:
        from bs4 import BeautifulSoup as BeautifulSoupClass
    BeautifulSoupClass.findSelect = select
 
def unmonkeypatch(BeautifulSoupClass=None):
    if not BeautifulSoupClass:
        from bs4 import BeautifulSoup as BeautifulSoupClass
    delattr(BeautifulSoupClass, 'findSelect')
 
 
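As a quick sanity check of select(), here is a small example against an inline HTML snippet (the markup is made up for illustration):

html = ('<div class="well-inner"><p>first</p><p><span>second</span></p></div>'
        '<div id="main"><a href="https://example.com">link</a></div>')
demo = Soup(html, "html.parser")
print(select(demo, ".well-inner p"))      # class selector: both <p> elements
print(select(demo, "div#main a"))         # ID selector, then descendant tag
print(select(demo, 'a[href^="https"]'))   # attribute selector: href starts with "https"
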

Fetching the page source

 
def download_content_source(url):
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    response = http.request('GET', url)
    return response.data.decode('utf-8')
 
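The helper above relies on urllib3's defaults. For real crawling it is often worth adding a timeout and automatic retries; a minimal variant (the timeout and retry values here are arbitrary):

def download_content_source_safe(url):
    # Same as download_content_source, but fail fast on slow hosts
    # and retry transient errors a few times.
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    response = http.request(
        'GET', url,
        timeout=urllib3.Timeout(connect=5.0, read=10.0),
        retries=urllib3.Retry(total=3, backoff_factor=0.5))
    if response.status != 200:
        raise RuntimeError('GET %s returned HTTP %d' % (url, response.status))
    return response.data.decode('utf-8')
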

Match the text directly with a regular expression: the code below extracts everything between "var packagedContests = [" and "true}];", which is a chunk of JSON. (scrape_contest() is assembled from the DOM fragments shown in the rest of this post; a sketch of the full function appears at the end.)

 
content = download_content_source(url)   # url of the listing page
m = re.search(r"var packagedContests = \[([\s\S]*?)true}\];", content)
js_data = "[" + m.group(1) + "true}]"
contests = simplejson.loads(js_data)   # renamed so it no longer shadows the json module

# Scrape only the first four contests.
for x in contests[:4]:
    pp.pprint(scrape_contest(str(x['id'])))
 
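To make the extraction concrete, here is the same pattern applied to a made-up one-element sample:

sample = 'var packagedContests = [{"id": 7, "open": true}]; var other = 1;'
m = re.search(r"var packagedContests = \[([\s\S]*?)true}\];", sample)
js_data = "[" + m.group(1) + "true}]"
print(simplejson.loads(js_data))   # [{'id': 7, 'open': True}]
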
 

Building the DOM

 
content = download_content_source(url)
soup = Soup(content, "html.parser")
 
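A side note: "html.parser" is the parser that ships with the standard library. If lxml is installed (py -m pip install lxml), Soup(content, "lxml") is generally faster and more tolerant of broken markup.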

Getting the text content of a single DOM element

 
title = select(soup, ".contest-info h1")[0].contents[0]
 
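Note that .contents[0] returns the first child node as-is and raises IndexError on an empty element; if the heading may contain nested tags, get_text() is the safer choice:

title = select(soup, ".contest-info h1")[0].get_text(strip=True)
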

Matching a group of elements

 
items = select(soup, ".well-inner div p")
item1 = items[0].contents[0]
item2 = items[1].span.contents[0]
item3 = items[2].contents[0]
item4 = items[3].contents[0]
 

Matching all the td cells but keeping only the second column (every other cell, starting from the second)

 
tds = select(soup, ".grid tr td")[1::2]
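
For reference, the fragments above presumably live inside the scrape_contest() function called earlier. A sketch of how they might fit together (the URL pattern and the returned field names are invented for illustration):

def scrape_contest(contest_id):
    # Hypothetical URL pattern -- the real one depends on the site being scraped.
    url = 'https://example.com/contests/' + contest_id
    content = download_content_source(url)
    soup = Soup(content, "html.parser")

    title = select(soup, ".contest-info h1")[0].contents[0]
    items = select(soup, ".well-inner div p")
    tds = select(soup, ".grid tr td")[1::2]
    return {
        'title': title,
        'item1': items[0].contents[0],
        'item2': items[1].span.contents[0],
        'item3': items[2].contents[0],
        'item4': items[3].contents[0],
        'columns': [td.contents[0] for td in tds],
    }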