Web Scraping and Page Analysis in Python with urllib3 and BeautifulSoup4
It has been a long time since I last used Python for web scraping and page analysis. This time a client specifically asked for Python, and it turned out that urllib2, which my old script depended on, is no longer even available to install; the only usable option is urllib3. Python is as convenient as ever, though: all the dependencies were sorted out in no time, and the code is also very simple.
First, install the required dependencies by running the following commands:
py -m pip install bs4
py -m pip install certifi
py -m pip install urllib3
py -m pip install simplejson
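To confirm everything installed cleanly, a quick import-and-version check works (assuming Python 3; certifi.where() is the CA bundle path we pass to urllib3 later):

import bs4, certifi, urllib3, simplejson
print(bs4.__version__, urllib3.__version__, simplejson.__version__)
print(certifi.where())  # path to the CA bundle used for HTTPS requests below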
The import statements:
import pprint
import urllib3, certifi
import simplejson
import sys
import subprocess
import json
import re
from bs4 import BeautifulSoup as Soup
from datetime import datetime

pp = pprint.PrettyPrinter(indent=4)
Parsing the page relies on soupselect, which lets you select DOM elements with jQuery-style CSS selectors. It is really just a handful of functions, so you can copy them straight into the script:
tag_re = re.compile('^[a-z0-9]+$')

attribselect_re = re.compile(
    r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
    r'=?"?(?P<value>[^\]"]*)"?\]$'
)

# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---/  \---/\-------------/    \-------/
#     |      |         |               |
#     |      |         |           The value
#     |      |    ~,|,^,$,* or =
#     |   Attribute
#    Tag

def attribute_checker(operator, attribute, value=''):
    """
    Takes an operator, attribute and optional value; returns a function
    that will return True for elements that match that combination.
    """
    return {
        '=': lambda el: el.get(attribute) == value,
        # attribute includes value as one of a set of space separated tokens
        '~': lambda el: value in el.get(attribute, '').split(),
        # attribute starts with value
        '^': lambda el: el.get(attribute, '').startswith(value),
        # attribute ends with value
        '$': lambda el: el.get(attribute, '').endswith(value),
        # attribute contains value
        '*': lambda el: value in el.get(attribute, ''),
        # attribute is either exactly value or starts with value-
        '|': lambda el: el.get(attribute, '') == value \
            or el.get(attribute, '').startswith('%s-' % value),
    }.get(operator, lambda el: el.has_attr(attribute))

def select(soup, selector):
    """
    soup should be a BeautifulSoup instance; selector is a CSS selector
    specifying the elements you want to retrieve.
    """
    tokens = selector.split()
    current_context = [soup]
    for token in tokens:
        m = attribselect_re.match(token)
        if m:
            # Attribute selector
            tag, attribute, operator, value = m.groups()
            if not tag:
                tag = True
            checker = attribute_checker(operator, attribute, value)
            found = []
            for context in current_context:
                found.extend([el for el in context.findAll(tag) if checker(el)])
            current_context = found
            continue
        if '#' in token:
            # ID selector
            tag, id = token.split('#', 1)
            if not tag:
                tag = True
            el = current_context[0].find(tag, {'id': id})
            if not el:
                return []  # No match
            current_context = [el]
            continue
        if '.' in token:
            # Class selector
            tag, klass = token.split('.', 1)
            if not tag:
                tag = True
            found = []
            for context in current_context:
                found.extend(context.findAll(tag,
                    {'class': lambda attr: attr and klass in attr.split()}))
            current_context = found
            continue
        if token == '*':
            # Star selector
            found = []
            for context in current_context:
                found.extend(context.findAll(True))
            current_context = found
            continue
        # Here we should just have a regular tag
        if not tag_re.match(token):
            return []
        found = []
        for context in current_context:
            found.extend(context.findAll(token))
        current_context = found
    return current_context

def monkeypatch(BeautifulSoupClass=None):
    """
    If you don't explicitly state the class to patch, defaults to the
    most common import location for BeautifulSoup.
    """
    if not BeautifulSoupClass:
        from bs4 import BeautifulSoup as BeautifulSoupClass
    BeautifulSoupClass.findSelect = select

def unmonkeypatch(BeautifulSoupClass=None):
    if not BeautifulSoupClass:
        from bs4 import BeautifulSoup as BeautifulSoupClass
    delattr(BeautifulSoupClass, 'findSelect')
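As a quick sanity check, here is a small usage example against a made-up HTML snippet, covering the class, id and attribute selector styles that select() understands:

html = '''
<div class="contest-info"><h1>Title</h1></div>
<table class="grid"><tr><td>Name</td><td id="first">Alpha</td></tr></table>
<input type="text" name="q">
'''
demo = Soup(html, "html.parser")
print(select(demo, ".contest-info h1"))  # class selector, then descendant tag
print(select(demo, "td#first"))          # tag plus id
print(select(demo, "input[type=text]"))  # attribute selector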
Fetching the page source
def download_content_source(url):
    # Verify TLS certificates against certifi's CA bundle.
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    response = http.request('GET', url)
    return response.data.decode('utf-8')
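In practice it is worth giving the pool manager a timeout and a retry policy so one slow server cannot hang the whole script; a minimal sketch using urllib3's Timeout and Retry helpers (the URL is just a placeholder):

http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',
    ca_certs=certifi.where(),
    timeout=urllib3.Timeout(connect=5.0, read=15.0),    # fail fast on dead hosts
    retries=urllib3.Retry(total=3, backoff_factor=0.5)  # retry transient errors
)
response = http.request('GET', 'https://example.com/')
print(response.status, len(response.data))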
You can also match the raw text directly with a regular expression. The code below extracts everything between "var packagedContests = [" and "true}];", which is a chunk of JSON:
m = re.search("var packagedContests = \[([\\s\\S]*?)true}\];", response.data.decode('utf-8')) js_data = "[" + m.group(1) + "true}]" json = simplejson.loads(js_data) i = 0 for x in json: i = i + 1 if i > 4: break pp.pprint(scrape_contest(str(x['id'])))
Building the DOM
content = download_content_source(url)
soup = Soup(content, "html.parser")
Getting the text content of a single DOM element
title = select(soup, ".contest-info h1")[0].contents[0]
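One caveat: .contents[0] is only the first child node, so it misbehaves when the element starts with a nested tag; BeautifulSoup's get_text() flattens all descendants instead. A tiny illustration with made-up markup:

h1 = Soup('<h1><span>Weekly</span> Contest</h1>', 'html.parser').h1
print(h1.contents[0])  # <span>Weekly</span> -- a Tag, not the text
print(h1.get_text())   # Weekly Contest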
Matching a group of elements
items = select(soup, ".well-inner div p")
item1 = items[0].contents[0]
item2 = items[1].span.contents[0]
item3 = items[2].contents[0]
item4 = items[3].contents[0]
Matching a group of td elements, but taking only the second column
tds = select(soup, ".grid tr td")[1::2]
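The [1::2] slice works because select() returns the cells row by row: in a two-column table they alternate label, value, label, value, so starting at index 1 with a step of 2 keeps only the second column. A quick demonstration with a plain list standing in for the cells:

cells = ['Name', 'Alpha', 'Prize', '$100', 'Deadline', 'Friday']
print(cells[1::2])  # ['Alpha', '$100', 'Friday'] -- the second column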