import requests
from lxml import etree
import pymongo
def request(url, method="get"):
    """Fetch *url* and parse the response body into an lxml HTML tree.

    Args:
        url: The URL to fetch.
        method: HTTP verb, ``"get"`` (default) or ``"post"``.

    Returns:
        The parsed lxml HTML document.

    Raises:
        ValueError: If *method* is not "get" or "post".  (The original
            code silently fell through and crashed later with
            UnboundLocalError on ``html``.)
    """
    if method == "get":
        r = requests.get(url)
    elif method == "post":
        r = requests.post(url)
    else:
        raise ValueError(f"unsupported method: {method!r}")
    # Parse once, after the branch — both branches did the same thing.
    return etree.HTML(r.text)
def elements(html, path):
    """Return the nodes that the XPath expression *path* selects in *html*."""
    return html.xpath(path)
def main():
    """Crawl listing pages 2-4 of the chinaz tech-background gallery and
    extract the detail-page links from each page.

    NOTE(review): ``hrefs`` is computed but never consumed or returned —
    presumably a later notebook cell used it; behavior kept as-is.
    """
    url0 = "https://sc.chinaz.com/tupian/kejibeijing_{}.html"
    # XPath of every image card's link on a listing page (loop-invariant,
    # so hoisted out of the loop).
    path = '//div[@id="container"]/div/div/a/@href'
    for i in range(2, 5):
        url = url0.format(i)
        html = request(url)
        hrefs = elements(html, path)


# Guarded so importing this module no longer triggers the crawl.
if __name__ == "__main__":
    main()
# Class-based version, using a queue
import requests
from queue import Queue
from lxml import etree
class ChinazSpider:
    """Queue-backed rewrite of the listing-page crawler (class form)."""

    def __init__(self):
        # Listing-page URL template; {} receives the page number.
        self.url0 = "https://sc.chinaz.com/tupian/kejibeijing_{}.html"
        # FIFO of listing-page URLs waiting to be fetched.
        self.url_list = Queue()

    def get_url_list(self):
        """Fill the URL queue with listing pages 2 through 4."""
        for page in range(2, 5):
            self.url_list.put(self.url0.format(page))
# (Notebook demo cell) exercise the queue-based spider interactively.
sp = ChinazSpider()
sp.get_url_list()
# Bare expression: notebook displays the Queue object itself.
sp.url_list
# Every get() returns the current head of the queue and advances past it
sp.url_list.get()
# A second get(): the head has already moved on
sp.url_list.get()
# Final version: run each pipeline stage in its own thread(s)
import requests
from queue import Queue
from bs4 import BeautifulSoup
import threading
class ChinazSpider:
    """Multithreaded image crawler for sc.chinaz.com, built as a pipeline
    of queues.  Each stage consumes one queue and feeds the next:

        url_list -> response_list -> data_list -> img_list -> files on disk

    All worker threads run forever as daemons; ``run()`` returns once every
    queue reports fully processed via ``task_done()``/``join()``.
    """

    def __init__(self):
        # Listing-page URL template; {} receives the page number.
        self.url0 = "https://sc.chinaz.com/tupian/kejibeijing_{}.html"
        self.url_list = Queue()       # listing-page URLs to fetch
        self.response_list = Queue()  # parsed BeautifulSoup documents
        self.data_list = Queue()      # per-page lists of <img> tags
        self.img_list = Queue()       # raw image bytes ready to save

    def get_url_list(self):
        """Producer: enqueue listing pages 2 through 4."""
        for i in range(2, 5):
            url = self.url0.format(i)
            print(url)
            self.url_list.put(url)

    def get_response_list(self):
        """Worker: fetch each queued URL and parse it with BeautifulSoup."""
        while True:
            url = self.url_list.get()
            r = requests.get(url)
            r.encoding = "utf-8"
            html = BeautifulSoup(r.text, 'lxml')
            self.response_list.put(html)
            self.url_list.task_done()

    def get_data_list(self):
        """Worker: select every image tag under the #container element."""
        while True:
            html = self.response_list.get()
            path = '#container a img'
            data = html.select(path)
            self.data_list.put(data)
            self.response_list.task_done()

    def get_img(self):
        """Worker: download each image.

        The real URL appears to live in the lazy-load ``src2`` attribute
        rather than ``src`` — TODO(review): confirm against current markup.
        """
        while True:
            imgs = self.data_list.get()
            for img in imgs:
                src = img.get('src2')
                if src:
                    # src2 is protocol-relative; prepend the scheme.
                    src = "https:" + src
                    print(f'请求:{src}')
                    r = requests.get(src)
                    self.img_list.put(r.content)
            self.data_list.task_done()

    def save_img(self):
        """Consumer: write image bytes to sequentially numbered JPEG files."""
        n = 1
        while True:
            img = self.img_list.get()
            with open(f'{n}.jpeg', 'wb') as p:
                p.write(img)
            n += 1
            self.img_list.task_done()

    def run(self):
        """Start all pipeline threads as daemons, then block until every
        queue has been fully drained."""
        thread_list = []
        thread_list.append(threading.Thread(target=self.get_url_list))
        for _ in range(10):
            thread_list.append(threading.Thread(target=self.get_response_list))
        for _ in range(10):
            thread_list.append(threading.Thread(target=self.get_data_list))
        for _ in range(50):
            thread_list.append(threading.Thread(target=self.get_img))
        thread_list.append(threading.Thread(target=self.save_img))
        for t in thread_list:
            # Daemon threads die with the main thread once the joins return.
            # (Fixed: Thread.setDaemon() is deprecated since Python 3.10.)
            t.daemon = True
            t.start()
        for q in (self.url_list, self.response_list,
                  self.data_list, self.img_list):
            q.join()
if __name__ == '__main__':
    # Build the spider and kick off the threaded crawl.
    spider = ChinazSpider()
    spider.run()
# Feedback / comments