立即登录

注册账号

联系我们

2022年3月10日 16:34

4. 列表页进入详情页

教案网

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Crawl the shixiaolei.com list page, follow each article link, and print
# a {title, date, content} dict per article.
list_url = "http://www.shixiaolei.com/posts/3/"
resp = requests.get(list_url, timeout=10)  # timeout: don't hang forever on a dead server
soup = BeautifulSoup(resp.text, 'lxml')
links = soup.select('div.title a')
# Tag.get(attr) returns the attribute's value — here the article's href.
for link in links:
    # urljoin resolves both absolute and relative hrefs against the list page.
    detail_url = urljoin(list_url, link.get('href'))
    detail_resp = requests.get(detail_url, timeout=10)
    detail_soup = BeautifulSoup(detail_resp.text, 'lxml')
    title = detail_soup.select('div.title h1')
    date = detail_soup.select('p.date')
    content = detail_soup.select('.post_text')
    if not (title and date and content):
        # Skip pages that don't match the expected layout instead of
        # crashing with IndexError on the [0] lookups below.
        continue
    record = {
        'title': title[0].get_text(),
        'date': date[0].get_text(),
        'content': content[0].get_text(strip=True),
    }
    print(record)

河北科技大学

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Crawl the HEBUST comprehensive-news list and print {title, date, content}
# for every linked article.
list_url = "https://news.hebust.edu.cn/zhxw/index.htm"
resp = requests.get(list_url, timeout=10)  # timeout: avoid hanging on a dead server
resp.encoding = 'utf-8'  # page is utf-8; requests can mis-guess from headers
soup = BeautifulSoup(resp.text, 'lxml')
links = soup.select('span.newsTxt02 a')
for link in links:
    # hrefs are relative, e.g. "03de2c19b9dd484eaf2d54a34d13e569.htm";
    # urljoin resolves them against the list page's directory.
    detail_url = urljoin(list_url, link.get('href'))
    detail_resp = requests.get(detail_url, timeout=10)
    detail_resp.encoding = 'utf-8'
    detail_soup = BeautifulSoup(detail_resp.text, 'lxml')
    title = detail_soup.select('div.articleTitle h3')
    date = detail_soup.select('.articleAuthor')
    content = detail_soup.select('.article')
    if not (title and date and content):
        # Skip articles whose markup doesn't match the expected selectors
        # rather than crashing with IndexError.
        continue
    record = {
        'title': title[0].get_text(),
        'date': date[0].get_text(),
        'content': content[0].get_text(strip=True),
    }
    print(record)

上海海洋大学

import requests
from bs4 import BeautifulSoup
url = "https://www.shou.edu.cn/yw/list.htm"
r = requests.get(url)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text,'lxml')
data = soup.select('.col_news_item')
for d in data:
    "https://www.shou.edu.cn/2022/0309/c147a304911/page.htm"
#     print(d.get('href'))
    url = "https://www.shou.edu.cn" + d.get('href')
    r = requests.get(url)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text,'lxml')
    title = soup.select('h1.arti_title')
    date = soup.select('.arti_update')
    auther = soup.select('.editor')
    content = soup.select('.wp_articlecontent')
    data = {
        'title':title[0].get_text(),
        'date':date[0].get_text(),
        'auther':auther[0].get_text(),
        'content':content[0].get_text()
    }
    print(data)

医院

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Crawl the hospital (hdzxyy.com) news list and print {title, content}
# for each article.
list_url = "http://hdzxyy.com/news.asp?cid=1"
resp = requests.get(list_url, timeout=10)  # timeout: avoid hanging on a dead server
# NOTE(review): 'gbk' is a superset of gb2312 and decodes more pages without
# errors — kept as gb2312 to preserve the original behavior exactly.
resp.encoding = 'gb2312'
soup = BeautifulSoup(resp.text, 'lxml')
links = soup.select('.nei_right_con_dt span a')
# The list repeats every link twice (e.g. image + text anchor), so take
# every second match — same stride as the original data[::2].
for link in links[::2]:
    # hrefs are relative, e.g. "news_info.asp?id=4082&cid=1"; urljoin
    # resolves them against the site root like the original concatenation.
    detail_url = urljoin(list_url, link.get('href'))
    detail_resp = requests.get(detail_url, timeout=10)
    detail_resp.encoding = 'gb2312'
    detail_soup = BeautifulSoup(detail_resp.text, 'lxml')
    title = detail_soup.select('.news_con_tit font')
    content = detail_soup.select('.news_con_con')
    if not (title and content):
        # Skip articles with unexpected markup instead of raising IndexError.
        continue
    record = {
        'title': title[0].get_text(),
        'content': content[0].get_text(strip=True),
    }
    print(record)

医院2

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Crawl the hospital (cyfy.cn) news list and print {title, content}
# for each article.
list_url = "https://www.cyfy.cn/lists/6.html"
resp = requests.get(list_url, timeout=10)  # timeout: avoid hanging on a dead server
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')
links = soup.select('.pad .h3 a')
for link in links:
    # Hrefs on this site appear absolute; urljoin leaves absolute URLs
    # unchanged and also resolves any relative ones correctly.
    detail_url = urljoin(list_url, link.get('href'))
    detail_resp = requests.get(detail_url, timeout=10)
    detail_resp.encoding = 'utf-8'
    detail_soup = BeautifulSoup(detail_resp.text, 'lxml')
    title = detail_soup.select('.t')
    content = detail_soup.select('.text')
    if not (title and content):
        # Skip articles with unexpected markup instead of raising IndexError.
        continue
    record = {
        'title': title[0].get_text(),
        'content': content[0].get_text(strip=True),
    }
    print(record)

 

留言

给我留言