
Scraping Jianshu's 7-Day Trending Posts

# -*-coding:utf-8-*-
import sys
import csv
import requests
from bs4 import BeautifulSoup

# These two lines work around Python 2's default ASCII encoding when handling Chinese text
reload(sys)
sys.setdefaultencoding('utf-8')

# The target URL was missing from the original post; the Jianshu 7-day trending page is assumed here
base_url = 'http://www.jianshu.com/trending/weekly'

# Request a single page and return its HTML
def getHtml(url):
    r = requests.get(url)
    return r.text

# Parse one page and return the scraped fields for every article on it
def parseHtml(html):
    soup = BeautifulSoup(html, 'lxml')
    articles = []
    for article in soup.find_all(class_='content'):
        title = article.find(class_='title').string
        link = 'http://www.jianshu.com' + article.find(class_='title').get('href')  # href is relative; the site root was missing in the original
        author = article.find(class_='blue-link').string
        time = article.span['data-shared-at']
        meta = article.find(class_='meta').find_all(['a', 'span'])
        metas = []
        for item in meta:
            metas.append(item.get_text().strip())
        read = metas[0]
        comment = metas[1]
        like = metas[2]
        try:
            money = metas[3]
        except IndexError:
            # not every article has a reward count
            money = None
        articles.append([title, author, time, read, comment, like, money, link])

    return articles

# Write the scraped rows to a CSV file
def writeCSV(file, data_list):
    with open(file, 'wb') as f:  # binary mode for the Python 2 csv module
        writer = csv.writer(f)
        writer.writerow(['文章标题', '作者', '时间', '阅读量', '评论', '喜欢', '赞赏数', '文章地址'])
        for data in data_list:
            for row in data:
                writer.writerow(row)

if __name__ == '__main__':
    data_list = []
    for i in range(1, 7):  # pages 1-6 of the weekly trending list
        url = base_url + '?page={}'.format(i)
        html = getHtml(url)
        data = parseHtml(html)
        data_list.append(data)
    writeCSV('jianshu.csv', data_list)
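
The script above targets Python 2: the reload/setdefaultencoding workaround and opening the CSV file in 'wb' mode only make sense there. As a minimal sketch of how the CSV step could look on Python 3 (keeping the same jianshu.csv layout, nothing else about the script changes):

# Python 3 variant of writeCSV: the sys encoding hack is unnecessary,
# and the file is opened in text mode with an explicit encoding.
import csv

def writeCSV(file, data_list):
    # utf-8-sig adds a BOM so Excel displays the Chinese headers correctly
    with open(file, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['文章标题', '作者', '时间', '阅读量', '评论', '喜欢', '赞赏数', '文章地址'])
        for data in data_list:
            for row in data:
                writer.writerow(row)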