正规项目:数据存储在数据库中(NoSQL,如 MongoDB),再从 MongoDB 导入 MySQL。
辅助工作:存入 txt 文本文件、Excel 文件。
# Scrape the news list page of hebkx.cn and save each article's text
# to a .txt file named after the article title.
import requests
from lxml import etree
import os

url = "http://www.hebkx.cn/indexnews/class/?113.html"
# timeout so a dead server cannot hang the script forever
r = requests.get(url, timeout=10)
# 1. print r to check for a 200 Response
# 2. print r.text and Ctrl+F to check the content we need is present
# Parse the returned HTML text into an etree document.
html = etree.HTML(r.text)
# Use the etree object's xpath method to locate the article links.
hrefs = html.xpath('//a[@class="url"]/@href')
# Create the output directory up front; the original crashed with
# FileNotFoundError when it did not already exist.
os.makedirs('科信要闻', exist_ok=True)
for href in hrefs:
    # hrefs look like "./xxx.html": drop the leading "./" (href[2:])
    # and rebuild an absolute article URL.
    url = "http://www.hebkx.cn/indexnews" + href[2:]
    r = requests.get(url, timeout=10)
    html = etree.HTML(r.text)
    # The first child div under div#detail holds the title.
    title = html.xpath('//div[@id="detail"]/div[1]/text()')
    content = html.xpath('//div[@class="detail"]//text()')
    with open(f'科信要闻/{title[0]}.txt', 'w', encoding='utf-8') as f:
        f.write("\n".join(content))
    print(f'{title[0]}.txt存储成功.')
练习:用 while True 实现翻页,用 XPath 定位元素(学习用途)。
# Paginate through the hebkx.cn news list with a while-True loop:
# scrape every article on the current page, then follow the
# "next page" link until none remains.
import requests
from lxml import etree
import string  # string.punctuation, for sanitising file names
import os

url0 = "http://www.hebkx.cn/indexnews/class/?113.html"  # start page
os.makedirs('科信要闻', exist_ok=True)  # ensure output dir exists
# Translation table that deletes every ASCII punctuation character.
# (The original passed string.punctuation straight to translate(),
# which is not a valid table, so nothing was actually removed.)
strip_punct = str.maketrans('', '', string.punctuation)
while True:
    # Fetch and parse the current list page.
    r = requests.get(url0, timeout=10)
    list_html = etree.HTML(r.text)
    # Scrape this page's articles FIRST — the original followed the
    # next-page link before scraping, so page 1 was never saved.
    hrefs = list_html.xpath('//a[@class="url"]/@href')
    for href in hrefs:
        # hrefs look like "./xxx.html": strip "./" and rebuild the URL.
        url_news = "http://www.hebkx.cn/indexnews" + href[2:]
        r = requests.get(url_news, timeout=10)
        html = etree.HTML(r.text)
        # First child div under div#detail holds the title.
        title = html.xpath('//div[@id="detail"]/div[1]/text()')
        # Remove punctuation so the title is a safe file name.
        title = title[0].translate(strip_punct)
        content = html.xpath('//div[@class="detail"]//text()')
        with open(f'科信要闻/{title}.txt', 'w', encoding='utf-8') as f:
            f.write("\n".join(content))
        print(f'{title}.txt存储成功.')
    # Follow the "next page" link; stop cleanly when there is none
    # (the original raised IndexError on the last page).
    next_page = list_html.xpath('//td[@style]/a/@href')
    if not next_page:
        break
    # next_page[0] looks like "./?113_2.html": drop the leading "."
    # and rebuild the list-page URL for the next iteration.
    url0 = "http://www.hebkx.cn/indexnews/class" + next_page[0][1:]
清华综合时讯
# Scrape Tsinghua "综合时讯" list pages 739 down to 729, saving each
# article's text plus its images into a per-article directory.
import requests
from lxml import etree
import os

# List-page URL template; {} is the page number.
url0 = "https://www.tsinghua.edu.cn/news/zhsx/{}.htm"
for i in range(739, 728, -1):
    r = requests.get(url0.format(i), timeout=10)
    html = etree.HTML(r.text)
    hrefs = html.xpath('//li[@class="zttj_img_li"]/a/@href')
    for href in hrefs:
        # hrefs start with a "../.." relative prefix (5 chars); drop it
        # and rebuild an absolute article URL.
        url = "https://www.tsinghua.edu.cn" + href[5:]
        r = requests.get(url, timeout=10)
        r.encoding = "utf-8"  # force correct decoding of the page
        html = etree.HTML(r.text)
        try:
            titles = html.xpath('//p[@class="bt"]/text()')
            contents = html.xpath('//div[@class="v_news_content"]//text()')
            imgs = html.xpath('//div[@class="v_news_content"]//img/@src')
            # makedirs(exist_ok=True): the original os.mkdir raised on an
            # already-existing directory, so re-runs skipped every article.
            os.makedirs(f"清华综合时讯/{titles[0]}", exist_ok=True)
            with open(f"清华综合时讯/{titles[0]}/{titles[0]}.txt",
                      "w", encoding="utf-8") as f:
                f.write("\n".join(contents))
            print(titles[0] + "---写入成功")
            for img in imgs:
                url = "https://www.tsinghua.edu.cn" + img
                r = requests.get(url, timeout=10)
                # Name the image file after the last URL path segment.
                with open(f"清华综合时讯/{titles[0]}/{url.split('/')[-1]}",
                          "wb") as p:
                    p.write(r.content)
                print(url.split('/')[-1] + "---写入成功")
        except Exception as e:
            # Narrowed from a bare except (which also swallowed
            # KeyboardInterrupt); report the cause and keep collecting.
            print('发生错误.继续采集')
            print(e)
            continue
# Load a JSON string of records into a DataFrame and export it to Excel.
import json  # the original called json.loads without importing json
import pandas as pd

d = '[{"name":"卢炎","age":18},{"name":"王婷","age":19}]'
data = json.loads(d)  # -> list of dicts, one per person
# 1. Convert the records into a DataFrame.
df = pd.DataFrame(data)
# 2. Rename the columns to Chinese.
df.columns = ["姓名", "年龄"]
# 3. Export to Excel.
df.to_excel('ly.xlsx')
详情页:
# Example: Juhe (juhe.cn) news-headline API — fetch the "keji" (tech)
# headline list, then each article's detail page, exporting to Excel.
import requests
import pandas as pd

# NOTE(security): API key hard-coded in source; move it to an
# environment variable or config file before sharing this script.
APPKEY = "da78dc675dc0dc40ad1e00d0554d2336"
url = f"http://v.juhe.cn/toutiao/index?type=keji&key={APPKEY}&is_filter=1"
r = requests.get(url, timeout=10)  # timeout so the script cannot hang
data = r.json()['result']['data']
df = pd.DataFrame(data)
# Keep only the columns we need.
df = df[["uniquekey", "title", "date", "category", "author_name"]]
# Rename to Chinese column names.
df.columns = ["id", "标题", "日期", "分类", "作者"]
# Export the headline list to Excel (disabled for now):
# df.to_excel('新闻头条.xlsx')

# Detail endpoint; {} placeholders are the API key and the article id.
url0 = "http://v.juhe.cn/toutiao/content?key={}&uniquekey={}"
data = []
for i in df["id"]:
    r = requests.get(url0.format(APPKEY, i), timeout=10)
    d = r.json()
    # The detail response nests the title under result.detail.
    title = d['result']['detail']['title']
    content = d['result']['content']
    data.append({
        'title': title,
        'content': content,
    })
df = pd.DataFrame(data)
df.to_excel('新闻详情.xlsx')
留言