立即登录

注册账号

联系我们

2022年4月1日 14:17

13. 函数式编程_类编程

import requests
from lxml import etree
import pymongo

def request(url, method="get"):
    """Fetch *url* and return the parsed lxml HTML tree.

    Args:
        url: Page URL to request.
        method: HTTP verb, "get" or "post" (case-insensitive; default "get").

    Returns:
        The ``lxml.etree`` element tree parsed from the response text.

    Raises:
        ValueError: If *method* is neither "get" nor "post".  The original
            code fell through both branches and crashed with
            ``UnboundLocalError`` on ``return html`` instead.
    """
    method = method.lower()
    if method == "get":
        r = requests.get(url)
    elif method == "post":
        r = requests.post(url)
    else:
        raise ValueError(f"unsupported method: {method!r}")
    return etree.HTML(r.text)

def elements(html, path):
    """Evaluate the XPath expression *path* against *html*.

    Thin convenience wrapper around ``html.xpath``; returns whatever
    the XPath evaluation yields (typically a list of nodes or strings).
    """
    return html.xpath(path)


def main():
    """Crawl listing pages 2-4 and extract the detail-page links."""
    base = "https://sc.chinaz.com/tupian/kejibeijing_{}.html"
    # XPath for the anchor hrefs inside the image grid container.
    href_path = '//div[@id="container"]/div/div/a/@href'
    for page in range(2, 5):
        page_html = request(base.format(page))
        hrefs = elements(page_html, href_path)  # result unused in this demo


main()

类形式, 用到队列

import requests
from queue import Queue
from lxml import etree

class ChinazSpider:
    """Collects listing-page URLs for the chinaz tech-background gallery."""

    def __init__(self):
        # URL template; pages 2-4 are enqueued by get_url_list().
        self.url0 = "https://sc.chinaz.com/tupian/kejibeijing_{}.html"
        self.url_list = Queue()

    def get_url_list(self):
        """Format pages 2-4 into full URLs and push them onto the queue."""
        for page in range(2, 5):
            self.url_list.put(self.url0.format(page))

# Notebook-style demo: build the spider and fill its URL queue.
sp = ChinazSpider()
sp.get_url_list()
sp.url_list  # bare expression — displays the Queue object in a REPL/notebook

# Each get() returns the current head of the queue and advances past it.
sp.url_list.get()
# Getting again: the head has changed.
sp.url_list.get()

最终版:为每个方法创建线程,通过多线程并发运行

import requests
from queue import Queue
from bs4 import BeautifulSoup
import threading


class ChinazSpider:
    """Multithreaded image crawler for sc.chinaz.com tech-background pages.

    Four queues form a producer/consumer pipeline:

        url_list -> response_list -> data_list -> img_list

    Each stage runs in one or more daemon threads started by ``run()``,
    which then blocks on ``Queue.join()`` for every queue so the process
    exits only after all enqueued work has been marked done.
    """

    def __init__(self):
        self.url0 = "https://sc.chinaz.com/tupian/kejibeijing_{}.html"
        self.url_list = Queue()       # listing-page URLs to fetch
        self.response_list = Queue()  # parsed BeautifulSoup documents
        self.data_list = Queue()      # per-page lists of <img> tags
        self.img_list = Queue()       # raw image bytes to write to disk

    def get_url_list(self):
        """Enqueue listing pages 2-4.

        NOTE(review): starts at page 2 — presumably page 1 uses a
        different URL; confirm against the site.
        """
        for i in range(2, 5):
            url = self.url0.format(i)
            print(url)
            self.url_list.put(url)

    def get_response_list(self):
        """Worker: fetch each queued URL and enqueue its parsed document."""
        while True:
            url = self.url_list.get()
            r = requests.get(url)
            r.encoding = "utf-8"
            html = BeautifulSoup(r.text, 'lxml')
            self.response_list.put(html)
            self.url_list.task_done()

    def get_data_list(self):
        """Worker: select the gallery <img> tags from each parsed page."""
        while True:
            html = self.response_list.get()
            path = '#container a img'
            data = html.select(path)
            self.data_list.put(data)
            self.response_list.task_done()

    def get_img(self):
        """Worker: download each image and enqueue its raw bytes.

        The real image URL lives in the ``src2`` attribute (the site
        lazy-loads images); tags without it are skipped.
        """
        while True:
            imgs = self.data_list.get()
            for img in imgs:
                src = img.get('src2')
                if src:
                    src = "https:" + src  # src2 is protocol-relative
                    print(f'请求:{src}')
                    r = requests.get(src)
                    self.img_list.put(r.content)
            self.data_list.task_done()

    def save_img(self):
        """Worker: write queued image bytes to sequentially numbered files."""
        n = 1
        while True:
            img = self.img_list.get()
            with open(f'{n}.jpeg', 'wb') as p:
                p.write(img)
            n += 1
            self.img_list.task_done()

    def run(self):
        """Start all pipeline threads and wait for every queue to drain."""
        thread_list = []
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)

        # Fetching and parsing are I/O-bound, so several workers each.
        for i in range(10):
            t_response = threading.Thread(target=self.get_response_list)
            thread_list.append(t_response)

        for i in range(10):
            t_data = threading.Thread(target=self.get_data_list)
            thread_list.append(t_data)

        for i in range(50):
            t_img = threading.Thread(target=self.get_img)
            thread_list.append(t_img)

        # Single writer so the filename counter needs no locking.
        t_save = threading.Thread(target=self.save_img)
        thread_list.append(t_save)

        for t in thread_list:
            # setDaemon() is deprecated since Python 3.10; assign the
            # daemon attribute instead.  Daemon threads let the process
            # exit once the joins below complete.
            t.daemon = True
            t.start()

        # Block until every stage has called task_done() for each item.
        for q in [self.url_list,
                  self.response_list,
                  self.data_list,
                  self.img_list]:
            q.join()


if __name__ == '__main__':
    # Script entry point: build the spider and run the full pipeline.
    spider = ChinazSpider()
    spider.run()

 

留言

给我留言