用面向对象的方式抓取糗事百科并保存到 MongoDB

Robin 5969次浏览

摘要:用 requests 发起请求,用 lxml 的 etree 提供的 XPath 解析 HTML,用 pymongo 把结果保存到 MongoDB

直接上代码吧,有不懂的可以留言或加QQ

import requests
from lxml import etree
import pymongo

class Spider:
    """Scrape Qiushibaike listing pages and store the entries in MongoDB.

    The spider connects to a MongoDB server on ``localhost`` and writes to
    the ``qiushi`` collection of the ``qiushi`` database.  Pages are fetched
    with ``requests`` and parsed with ``lxml`` XPath queries.
    """

    def __init__(self):
        """Open the MongoDB connection and select the target database."""
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['qiushi']
        self.collection_name = 'qiushi'

    def run(self, url):
        """Fetch one listing page and save every usable entry on it.

        Args:
            url: Absolute URL of the listing page to scrape.
        """
        html = self.getHtml(url)
        tree = etree.HTML(html)
        for content in tree.xpath('//div[@id="content-left"]/div'):
            # Entries whose second <span> carries text are skipped.
            # NOTE(review): presumably that span is the "read full text"
            # marker of a truncated post -- confirm against the page markup.
            if content.xpath('a[1]/div/span[2]/text()'):
                continue

            texts = content.xpath('a[1]/div/span[1]/text()')
            if not texts:
                # The text is the upsert key; without it there is nothing
                # to store, so skip instead of raising IndexError.
                continue

            likes = content.xpath(
                'div[@class="stats"]/span[@class="stats-vote"]/i/text()'
            )
            thumbs = content.xpath('div[@class="thumb"]/a/img/@src')

            # Build a fresh dict per entry so no value can leak from a
            # previous iteration.
            item = {
                'content': texts[0],
                'likes': likes[0] if likes else '',
                # The src is concatenated after 'https:'; entries without
                # a thumbnail get an empty string.
                'image': ('https:' + thumbs[0]) if thumbs else '',
            }
            self.save_mongo(item)

    def getHtml(self, url, timeout=10):
        """Download *url* and return the response body as text.

        Args:
            url: Absolute URL to request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl indefinitely.

        Returns:
            The decoded response body.
        """
        return requests.get(url, timeout=timeout).text

    def save_mongo(self, items):
        """Insert or replace the stored document for one scraped entry.

        The document is keyed on its ``content`` field: an existing document
        with the same content is replaced, otherwise a new one is inserted.

        Args:
            items: Mapping holding at least a ``content`` key.
        """
        # Collection.update() was removed in PyMongo 4; replace_one() with
        # upsert=True is the equivalent whole-document upsert.  A copy is
        # passed so the caller's dict is not handed to the driver.
        self.db[self.collection_name].replace_one(
            {'content': items['content']}, dict(items), upsert=True
        )

    def close_mongo(self):
        """Close the MongoDB client connection."""
        self.client.close()

    def getCate(self, url):
        """Read the site menu and return a page-URL template per category.

        Args:
            url: Base URL of the site, without a trailing slash.

        Returns:
            A dict mapping each menu label to a URL template containing a
            ``{page}`` placeholder to fill in with ``str.format``.
        """
        html = self.getHtml(url)
        tree = etree.HTML(html)
        cate_names = tree.xpath('//div[@id="menu"]/a/text()')
        cate_urls = tree.xpath('//div[@id="menu"]/a/@href')
        return self._build_cate_urls(url, cate_names, cate_urls)

    @staticmethod
    def _build_cate_urls(base_url, cate_names, cate_urls):
        """Pair menu labels with hrefs and build the page-URL templates."""
        cates = {}
        for name, cate_url in zip(cate_names, cate_urls):
            # The root link is rewritten to the '/8hr/' path so that a page
            # suffix can be appended to it like the other categories.
            if cate_url == '/':
                cate_url = '/8hr/'
            cates[name] = base_url + cate_url + 'page/{page}'
        return cates

if __name__ == '__main__':
    # Crawl pages 1-34 of every category listed in the site menu.
    url = 'https://www.qiushibaike.com'
    spider = Spider()
    try:
        cates = spider.getCate(url)
        for cate, page_template in cates.items():
            for page in range(1, 35):
                print('--------正在抓取糗事百科 - 【%s】  第%s页--------' % (cate, page))
                spider.run(page_template.format(page=page))
    finally:
        # Release the MongoDB connection even if a request or parse step
        # raises part-way through the crawl.
        spider.close_mongo()


image.png


image.png


原创文章转载请注明出处。


相关文章