I used Beautiful Soup, one of the best-known third-party libraries for Python web scraping, to do the crawling. The code is actually an improved version of a script a blogger posted on Jianshu back in 2016; quite a few things in it no longer work today, and most importantly the way the information is extracted had to be rewritten. I fixed these problems, optimized some of his methods, and summarize the result here.
import requests
from bs4 import BeautifulSoup
import time
import re
import pymysql
import random
def get_tag():
    channel = []
    headers = {'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    # Fetch the tag index page with the requests module
    url = "https://book.douban.com/tag/?icn=index-nav"
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, "lxml")  # parse the page
    tags = soup.select("table.tagCol > tbody > tr > td > a")
    # Locate the tag links by CSS path (get it via right-click - Inspect - Copy selector); select() returns a list
    for tag in tags:
        tag = tag.get_text()  # extract the text of each tag link
        helf = "https://book.douban.com/tag/"
        # Every Douban tag page is just this prefix plus the tag name, so assemble the URL of each tag's listing page
        url = helf + str(tag)
        channel.append(url)  # URL assembled, collect it
    return channel
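The selector string passed to select() is exactly what the browser's "Copy selector" gives you. As a quick, self-contained illustration of how that maps onto soup.select() (the HTML below is a made-up miniature of Douban's tag table, not real page content):

# Standalone illustration, not part of the script above
from bs4 import BeautifulSoup

sample_html = """
<table class="tagCol"><tbody><tr>
  <td><a href="/tag/小说">小说</a></td>
  <td><a href="/tag/历史">历史</a></td>
</tr></tbody></table>
"""
sample_soup = BeautifulSoup(sample_html, "lxml")
links = sample_soup.select("table.tagCol > tbody > tr > td > a")
print([a.get_text() for a in links])  # ['小说', '历史']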
def ceshi_price_one(detail):
    # Price is the 5th field of the pub line: author / translator / publisher / date / price
    price = detail.split("/", 4)[4].split()
    if re.match("USD", price[0]):
        price = float(price[1]) * 6       # rough USD -> CNY conversion
    elif re.match("CNY", price[0]):
        price = price[1]
    elif re.match(r"\$", price[0]):
        price = float(price[0][1:]) * 6   # prices like "$19.99": strip the dollar sign
    else:
        price = price[0]
    return price
def ceshi_price_two(detail):
    # Price is the 4th field when there is no translator: author / publisher / date / price
    price = detail.split("/", 3)[3].split()
    if re.match("USD", price[0]):
        price = float(price[1]) * 6       # rough USD -> CNY conversion
    elif re.match("CNY", price[0]):
        price = price[1]
    elif re.match(r"\$", price[0]):
        price = float(price[0][1:]) * 6   # prices like "$19.99": strip the dollar sign
    else:
        price = price[0]
    return price
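The two helpers differ only in how many fields the pub line is split into (5 when there is a translator, 4 when there is not). If you prefer, they could be folded into one function; this is just a sketch, not part of the original script, and the name parse_price and its n_fields parameter are my own:

def parse_price(detail, n_fields):
    # n_fields is 5 when the pub line contains a translator field, 4 when it does not;
    # the price is always the last field.
    price = detail.split("/", n_fields - 1)[n_fields - 1].split()
    if re.match("USD", price[0]):
        return float(price[1]) * 6       # rough USD -> CNY conversion
    if re.match("CNY", price[0]):
        return price[1]
    if re.match(r"\$", price[0]):
        return float(price[0][1:]) * 6   # "$19.99" style
    return price[0]

ceshi_price_one(detail) would then simply be parse_price(detail, 5) and ceshi_price_two(detail) would be parse_price(detail, 4); the two original functions above are kept as they are.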
def mains(url):
    headers = {'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text.encode("utf-8"), "lxml")
    tag = url.split("?")[0].split("/")[-1]  # the tag name is the last path segment of the URL
    content_list = soup.select("#subject_list > ul > li")
    titles = []
    details = []
    scores = []
    persons = []
    for lis in content_list:
        title = lis.select("div.info > h2 > a")  # book title
        if title:
            title = title[0].get_text().replace("\n", '').replace(" ", '').strip()
        else:
            title = ''
        titles.append(title)
        detail = lis.select("div.info > div.pub")  # pub line: author / translator / publisher / date / price
        if detail:
            detail = detail[0].get_text().replace("\n", '').replace(" ", '').strip()
        else:
            detail = ''
        details.append(detail)
        score = lis.select("div.info > div.star.clearfix > span.rating_nums")  # rating
        if score:
            score = score[0].get_text().replace("\n", '').replace(" ", '').strip()
        else:
            score = ''
        scores.append(score)
        person = lis.select("div.info > div.star.clearfix > span.pl")  # number of ratings
        if person:
            person = person[0].get_text().replace("\n", '').replace(" ", '').strip() \
                .replace("人评价)", "").replace("(", "")
        else:
            person = '10'
        persons.append(person)
    for detail, score, person, title in zip(details, scores, persons, titles):
        l = []  # list used to collect one row of data
        try:
            # Pub line with a translator: author / translator / publisher / date / price
            author = detail.split("/", 4)[0].split()[0]
            translator = detail.split("/", 4)[1]
            publish = detail.split("/", 4)[2]
            time = detail.split("/", 4)[3].split()[0].split("-")[0]  # keep only the year (local name shadows the time module only inside this function)
            price = ceshi_price_one(detail)
            score = score if score else ""
            title = title.split()[0]
        except IndexError:
            try:
                # Pub line without a translator: author / publisher / date / price
                author = detail.split("/", 3)[0].split()[0]
                translator = ""
                publish = detail.split("/", 3)[1]
                time = detail.split("/", 3)[2].split()[0].split("-")[0]
                price = ceshi_price_two(detail)
                score = score if score else ""
                title = title.split()[0]
            except (IndexError, TypeError):
                continue
        except TypeError:
            continue
        l.append([title, score, author, price, time, publish, person, translator, tag])
        print(l)
        sql = "INSERT INTO all_books values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"  # the SQL insert statement
        cur.executemany(sql, l)  # execute it, inserting the row with executemany()
        conn.commit()  # commit
# mains("https://book.douban.com/tag/科技?start=0&type=T")
# Connect Python to the douban_db database in MySQL
conn = pymysql.connect(user="root", password="your MySQL password (the account does not have to be root)", database="douban_db", charset='utf8')
cur = conn.cursor()
cur.execute('DROP TABLE IF EXISTS all_books')  # drop the all_books table if it already exists
sql = """CREATE TABLE all_books(
title CHAR(255) NOT NULL,
score CHAR(255),
author CHAR(255),
price CHAR(255),
time CHAR(255),
publish CHAR(255),
person CHAR(255),
translator CHAR(255),
tag CHAR(255)
)ENGINE = InnoDB DEFAULT CHARSET = utf8;"""
cur.execute(sql) # 执行sql语句,新建一个all_books的数据库
start = time.time()
# Start a wall-clock timer so we know how long the whole crawl takes (process_time() would not count sleep or network waits)
channel = get_tag()
for urls in channel:
    urlss = [urls + "?start={}&type=T".format(str(i)) for i in range(0, 980, 20)]  # take each tag URL from channel and assemble the link of every listing page (20 books per page)
    for url in urlss:
        mains(url)  # run the main function and scrape this page
        print(url)  # print the link being scraped, so we know how far we have got and can deal with errors
        time.sleep(random.randint(2, 5))  # sleep a random number of seconds between pages to avoid getting the IP banned
end = time.time()
print('Time Usage:', end - start)  # crawl finished, print the elapsed time
count = cur.execute('select * from all_books')
print('has %s record' % count)  # print the total number of scraped records
# Release the database connection
if cur:
    cur.close()
if conn:
    conn.close()
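Once the crawl has finished, the data can be explored directly from MySQL. A small standalone sketch (not part of the script above, and the query is only an example) of pulling the ten highest-rated books for one tag back out with pymysql:

# Standalone sketch: query the scraped table afterwards
import pymysql

conn = pymysql.connect(user="root", password="your MySQL password", database="douban_db", charset="utf8")
cur = conn.cursor()
# score is stored as CHAR, so cast it to get a numeric ordering
cur.execute(
    "SELECT title, score, author, publish FROM all_books "
    "WHERE tag = %s AND score != '' "
    "ORDER BY CAST(score AS DECIMAL(3,1)) DESC LIMIT 10",
    ("小说",)
)
for title, score, author, publish in cur.fetchall():
    print(title, score, author, publish)
cur.close()
conn.close()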