I used Beautiful Soup, one of the best-known third-party libraries for Python web scraping, to do the crawling. The code is actually an improved version of a script a blogger posted on Jianshu back in 2016; quite a few things in it no longer work today, and most importantly the way the information is extracted had to be rewritten. I fixed these problems, optimized some of his methods, and summarize the result here.
import requests
from bs4 import BeautifulSoup
import time
import re
import pymysql
import random
def get_tag():
    channel = []
    headers = {'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    # Fetch the tag index page with the requests module
    url = "https://book.douban.com/tag/?icn=index-nav"
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, "lxml")  # parse the page
    tags = soup.select("table.tagCol > tbody > tr > td > a")
    # Locate the tag links by CSS path (get it via right-click - Inspect - Copy selector); select() returns a list
    for tag in tags:
        tag = tag.get_text()  # extract the text of each tag link
        helf = "https://book.douban.com/tag/"
        # Every Douban tag page is just this prefix plus the tag name, so assemble the URL of each tag's listing page
        url = helf + str(tag)
        channel.append(url)  # URL assembled, collect it
    return channel
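The selector string passed to select() is exactly what the browser's "Copy selector" gives you. As a quick, self-contained illustration of how that maps onto soup.select() (the HTML below is a made-up miniature of Douban's tag table, not real page content):

# Standalone illustration, not part of the script above
from bs4 import BeautifulSoup

sample_html = """
<table class="tagCol"><tbody><tr>
  <td><a href="/tag/小说">小说</a></td>
  <td><a href="/tag/历史">历史</a></td>
</tr></tbody></table>
"""
sample_soup = BeautifulSoup(sample_html, "lxml")
links = sample_soup.select("table.tagCol > tbody > tr > td > a")
print([a.get_text() for a in links])  # ['小说', '历史']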
def ceshi_price_one(detail):
    # Price is the 5th field of the pub line: author / translator / publisher / date / price
    price = detail.split("/", 4)[4].split()
    if re.match("USD", price[0]):
        price = float(price[1]) * 6       # rough USD -> CNY conversion
    elif re.match("CNY", price[0]):
        price = price[1]
    elif re.match(r"\$", price[0]):
        price = float(price[0][1:]) * 6   # prices like "$19.99": strip the dollar sign
    else:
        price = price[0]
    return price
def ceshi_price_two(detail):
    # Price is the 4th field when there is no translator: author / publisher / date / price
    price = detail.split("/", 3)[3].split()
    if re.match("USD", price[0]):
        price = float(price[1]) * 6       # rough USD -> CNY conversion
    elif re.match("CNY", price[0]):
        price = price[1]
    elif re.match(r"\$", price[0]):
        price = float(price[0][1:]) * 6   # prices like "$19.99": strip the dollar sign
    else:
        price = price[0]
    return price
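The two helpers differ only in how many fields the pub line is split into (5 when there is a translator, 4 when there is not). If you prefer, they could be folded into one function; this is just a sketch, not part of the original script, and the name parse_price and its n_fields parameter are my own:

def parse_price(detail, n_fields):
    # n_fields is 5 when the pub line contains a translator field, 4 when it does not;
    # the price is always the last field.
    price = detail.split("/", n_fields - 1)[n_fields - 1].split()
    if re.match("USD", price[0]):
        return float(price[1]) * 6       # rough USD -> CNY conversion
    if re.match("CNY", price[0]):
        return price[1]
    if re.match(r"\$", price[0]):
        return float(price[0][1:]) * 6   # "$19.99" style
    return price[0]

ceshi_price_one(detail) would then simply be parse_price(detail, 5) and ceshi_price_two(detail) would be parse_price(detail, 4); the two original functions above are kept as they are.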
def mains(url):
    headers = {'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text.encode("utf-8"), "lxml")
    tag = url.split("?")[0].split("/")[-1]  # the tag name is the last path segment of the URL
    content_list = soup.select("#subject_list > ul > li")
    titles = []
    details = []
    scores = []
    persons = []
    for lis in content_list:
        title = lis.select("div.info > h2 > a")  # book title
        if title:
            title = title[0].get_text().replace("\n", '').replace(" ", '').strip()
        else:
            title = ''
        titles.append(title)
        detail = lis.select("div.info > div.pub")  # pub line: author / translator / publisher / date / price
        if detail:
            detail = detail[0].get_text().replace("\n", '').replace(" ", '').strip()
        else:
            detail = ''
        details.append(detail)
        score = lis.select("div.info > div.star.clearfix > span.rating_nums")  # rating
        if score:
            score = score[0].get_text().replace("\n", '').replace(" ", '').strip()
        else:
            score = ''
        scores.append(score)
        person = lis.select("div.info > div.star.clearfix > span.pl")  # number of ratings
        if person:
            person = person[0].get_text().replace("\n", '').replace(" ", '').strip() \
                .replace("人评价)", "").replace("(", "")
        else:
            person = '10'
        persons.append(person)
    for detail, score, person, title in zip(details, scores, persons, titles):
        l = []  # list used to collect one row of data
        try:
            # Pub line with a translator: author / translator / publisher / date / price
            author = detail.split("/", 4)[0].split()[0]
            translator = detail.split("/", 4)[1]
            publish = detail.split("/", 4)[2]
            time = detail.split("/", 4)[3].split()[0].split("-")[0]  # keep only the year (local name shadows the time module only inside this function)
            price = ceshi_price_one(detail)
            score = score if score else ""
            title = title.split()[0]
        except IndexError:
            try:
                # Pub line without a translator: author / publisher / date / price
                author = detail.split("/", 3)[0].split()[0]
                translator = ""
                publish = detail.split("/", 3)[1]
                time = detail.split("/", 3)[2].split()[0].split("-")[0]
                price = ceshi_price_two(detail)
                score = score if score else ""
                title = title.split()[0]
            except (IndexError, TypeError):
                continue
        except TypeError:
            continue
        l.append([title, score, author, price, time, publish, person, translator, tag])
        print(l)
        sql = "INSERT INTO all_books values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"  # the SQL insert statement
        cur.executemany(sql, l)  # execute it, inserting the row with executemany()
        conn.commit()  # commit
# mains("https://book.douban.com/tag/科技?start=0&type=T")
# Connect Python to the douban_db database in MySQL
conn = pymysql.connect(user="root", password="your MySQL password (the account does not have to be root)", database="douban_db", charset='utf8')
cur = conn.cursor()
cur.execute('DROP TABLE IF EXISTS all_books')  # drop the all_books table if it already exists
sql = """CREATE TABLE all_books(
title CHAR(255) NOT NULL,
score CHAR(255),
author CHAR(255),
price CHAR(255),
time CHAR(255),
publish CHAR(255),
person CHAR(255),
translator CHAR(255),
tag CHAR(255)
)ENGINE = InnoDB DEFAULT CHARSET = utf8;"""
cur.execute(sql) # 执行sql语句,新建一个all_books的数据库
start = time.time()
# Start a wall-clock timer so we know how long the whole crawl takes (process_time() would not count sleep or network waits)
channel = get_tag()
for urls in channel:
    urlss = [urls + "?start={}&type=T".format(str(i)) for i in range(0, 980, 20)]  # take each tag URL from channel and assemble the link of every listing page (20 books per page)
    for url in urlss:
        mains(url)  # run the main function and scrape this page
        print(url)  # print the link being scraped, so we know how far we have got and can deal with errors
        time.sleep(random.randint(2, 5))  # sleep a random number of seconds between pages to avoid getting the IP banned
end = time.time()
print('Time Usage:', end - start)  # crawl finished, print the elapsed time
count = cur.execute('select * from all_books')
print('has %s record' % count)  # print the total number of scraped records
# Release the database connection
if cur:
    cur.close()
if conn:
    conn.close()
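Once the crawl has finished, the data can be explored directly from MySQL. A small standalone sketch (not part of the script above, and the query is only an example) of pulling the ten highest-rated books for one tag back out with pymysql:

# Standalone sketch: query the scraped table afterwards
import pymysql

conn = pymysql.connect(user="root", password="your MySQL password", database="douban_db", charset="utf8")
cur = conn.cursor()
# score is stored as CHAR, so cast it to get a numeric ordering
cur.execute(
    "SELECT title, score, author, publish FROM all_books "
    "WHERE tag = %s AND score != '' "
    "ORDER BY CAST(score AS DECIMAL(3,1)) DESC LIMIT 10",
    ("小说",)
)
for title, score, author, publish in cur.fetchall():
    print(title, score, author, publish)
cur.close()
conn.close()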