1. 爬取文章标题
import requests
from bs4 import BeautifulSoup
# Fetch the tutorial page and print the text of every link matched by
# `ul.tabNav li a` (the article titles, per the section heading).
url = "http://www.shixiaolei.com/2022/3/7/1-pa-chong-ru-men/"
# A timeout keeps the script from hanging forever if the server stalls.
r = requests.get(url, timeout=10)
# NOTE(review): r.text uses the charset requests infers from the response
# headers; if the printed text is garbled, confirm the page's real encoding.
soup = BeautifulSoup(r.text, 'lxml')
data = soup.select('ul.tabNav li a')
for link in data:
    print(link.get_text())
2. 爬取发布日期和作者
# Print the text of every `p.date` element (publish date and author, per the
# section heading). Reuses the `soup` parsed from the page fetched earlier.
data = soup.select('p.date')
for tag in data:
    print(tag.get_text())
3. 爬取正文
# Print the text of every element with class `post_text` (the article body,
# per the section heading). Reuses the `soup` parsed earlier.
data = soup.select('.post_text')
for tag in data:
    print(tag.get_text())
4. 爬取导航菜单
# Print the text of every link matched by `ul.nav li a` (the navigation menu,
# per the section heading). strip=True trims surrounding whitespace from each
# text fragment. Reuses the `soup` parsed earlier.
data = soup.select('ul.nav li a')
for link in data:
    print(link.get_text(strip=True))
5. 爬取底部文字
# Print the text of every paragraph matched by `ul.foot li p` (the footer
# text, per the section heading). strip=True trims surrounding whitespace.
# Reuses the `soup` parsed earlier.
data = soup.select('ul.foot li p')
for paragraph in data:
    print(paragraph.get_text(strip=True))
6. 爬取科信新闻列表页标题
# Fetch the news list page and print the text of every link matched by
# `.mod_NewsList2 li a` (the news titles, per the section heading).
url = "http://www.hebkx.cn/jijiao/main/index.php"
# A timeout keeps the script from hanging forever if the server stalls.
r = requests.get(url, timeout=10)
# NOTE(review): r.text uses the charset requests infers from the response
# headers; if the printed text is garbled, confirm the page's real encoding.
soup = BeautifulSoup(r.text, 'lxml')
data = soup.select('.mod_NewsList2 li a')
for link in data:
    print(link.get_text())
7. 爬取科信新闻详情页标题和正文
# Fetch one news detail page and print its title (`.newstitle`) followed by
# its body (`.detail`).
url = "http://www.hebkx.cn/jijiao/html/?6173.html"
# A timeout keeps the script from hanging forever if the server stalls.
r = requests.get(url, timeout=10)
# NOTE(review): r.text uses the charset requests infers from the response
# headers; if the printed text is garbled, confirm the page's real encoding.
soup = BeautifulSoup(r.text, 'lxml')

# select() returns an empty list when nothing matches, so check before
# indexing to avoid an IndexError if the page layout changes.
data = soup.select('.newstitle')
if data:
    print(data[0].get_text())

data = soup.select('.detail')
if data:
    print(data[0].get_text(strip=True))
8. 爬取菜单
# Fetch the news detail page and print the text of every link matched by
# `.menu li a` (the site menu, per the section heading).
url = "http://www.hebkx.cn/jijiao/html/?6173.html"
# A timeout keeps the script from hanging forever if the server stalls.
r = requests.get(url, timeout=10)
# NOTE(review): r.text uses the charset requests infers from the response
# headers; if the printed text is garbled, confirm the page's real encoding.
soup = BeautifulSoup(r.text, 'lxml')
data = soup.select('.menu li a')
for link in data:
    print(link.get_text())
留言