立即登录

注册账号

联系我们

2022年3月25日 14:25

12. 数据存储

正规项目:数据一般存储在数据库中,例如 NoSQL 数据库 MongoDB,之后也可以从 MongoDB 导入 MySQL。
辅助工作:可以直接存为 txt 文本文件或 Excel 文件。

import os

import requests
from lxml import etree

# Listing page of the news section.
url = "http://www.hebkx.cn/indexnews/class/?113.html"
# Folder the article text files are written to. open() does not create
# missing folders, so make sure it exists before the loop starts.
out_dir = "科信要闻"
os.makedirs(out_dir, exist_ok=True)

# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=10)
# Debugging tips:
# 1. print(r) and check that it shows <Response [200]>.
# 2. print(r.text) and use Ctrl+F to check the text contains what we need.
# Parse the returned HTML text into an lxml element tree.
html = etree.HTML(r.text)
# Use the tree's xpath() method to collect the href of every <a class="url">.
hrefs = html.xpath('//a[@class="url"]/@href')
for href in hrefs:
    # href[2:] drops the first two characters of the link before it is
    # appended to the site prefix (presumably a leading ".." - confirm).
    news_url = "http://www.hebkx.cn/indexnews" + href[2:]
    news_page = etree.HTML(requests.get(news_url, timeout=10).text)
    # Text of the first child <div> of the <div id="detail">.
    title = news_page.xpath('//div[@id="detail"]/div[1]/text()')
    if not title:
        # Without a title there is no file name; skip instead of crashing
        # with IndexError on title[0].
        print(f'{news_url} 未找到标题,已跳过.')
        continue
    # Every text node below the <div class="detail">.
    content = news_page.xpath('//div[@class="detail"]//text()')
    with open(f'{out_dir}/{title[0]}.txt', 'w', encoding='utf-8') as f:
        f.write("\n".join(content))
    print(f'{title[0]}.txt存储成功.')

用 while True 实现翻页:通过 XPath 定位“下一页”链接(仅作为学习示例)。

import os
import string  # string.punctuation: the ASCII punctuation characters

import requests
from lxml import etree

url0 = "http://www.hebkx.cn/indexnews/class/?113.html"  # first listing page
# Folder the article text files are written to; open() does not create it.
out_dir = "科信要闻"
os.makedirs(out_dir, exist_ok=True)

# str.translate() needs a table built by str.maketrans(); passing
# string.punctuation itself removes nothing. This table deletes every ASCII
# punctuation character, including "|", so no separate replace("|", "") is
# needed. Full-width (Chinese) punctuation is not in string.punctuation and
# is left in place.
strip_punctuation = str.maketrans("", "", string.punctuation)

visited = set()  # listing pages already processed; guards against cycles
url = url0
while True:
    visited.add(url)
    # Request the current listing page (timeout: never hang forever).
    r = requests.get(url, timeout=10)
    html = etree.HTML(r.text)
    # Links found under <td style=...>; the first one is used as "next page".
    next_page = html.xpath('//td[@style]/a/@href')
    # Scrape the articles of the current page, so the first page is included
    # and each listing page is downloaded only once.
    hrefs = html.xpath('//a[@class="url"]/@href')
    for href in hrefs:
        # href[2:] drops the first two characters of the link before it is
        # appended to the site prefix (presumably a leading ".." - confirm).
        url_news = "http://www.hebkx.cn/indexnews" + href[2:]
        news_page = etree.HTML(requests.get(url_news, timeout=10).text)
        # Text of the first child <div> of the <div id="detail">.
        title = news_page.xpath('//div[@id="detail"]/div[1]/text()')
        if not title:
            print(f'{url_news} 未找到标题,已跳过.')
            continue
        # Turn the title into a usable file name: trim surrounding
        # whitespace and remove ASCII punctuation.
        title = title[0].strip().translate(strip_punctuation)
        if not title:
            print(f'{url_news} 未找到标题,已跳过.')
            continue
        content = news_page.xpath('//div[@class="detail"]//text()')
        with open(f'{out_dir}/{title}.txt', 'w', encoding='utf-8') as f:
            f.write("\n".join(content))
        print(f'{title}.txt存储成功.')
    # Move on to the next listing page; stop when there is no link or the
    # link leads back to a page that was already processed.
    if not next_page:
        break
    url = "http://www.hebkx.cn/indexnews/class" + next_page[0][1:]
    if url in visited:
        break

清华综合时讯

import os

import requests
from lxml import etree

# Listing-page URL template; {} is replaced by the page number.
url0 = "https://www.tsinghua.edu.cn/news/zhsx/{}.htm"
for i in range(739, 728, -1):  # page numbers 739 down to 729
    url = url0.format(i)
    # A timeout keeps the script from hanging on an unresponsive server.
    r = requests.get(url, timeout=10)
    html = etree.HTML(r.text)
    hrefs = html.xpath('//li[@class="zttj_img_li"]/a/@href')
    for href in hrefs:
        # href[5:] drops the first five characters of the link before it is
        # appended to the site root (presumably a "../.." prefix - confirm).
        news_url = "https://www.tsinghua.edu.cn" + href[5:]
        try:
            r = requests.get(news_url, timeout=10)
            # Force UTF-8 decoding of the article page.
            r.encoding = "utf-8"
            news = etree.HTML(r.text)
            titles = news.xpath('//p[@class="bt"]/text()')
            contents = news.xpath('//div[@class="v_news_content"]//text()')
            imgs = news.xpath('//div[@class="v_news_content"]//img/@src')
            # One folder per article. makedirs also creates the parent
            # folder and does not fail when the script is run again.
            folder = f"清华综合时讯/{titles[0]}"
            os.makedirs(folder, exist_ok=True)
            with open(f"{folder}/{titles[0]}.txt",
                      "w", encoding="utf-8") as f:
                f.write("\n".join(contents))
            print(titles[0] + "---写入成功")
            for img in imgs:
                # Assumes the src attribute is a site-absolute path that
                # starts with "/" - TODO confirm.
                img_url = "https://www.tsinghua.edu.cn" + img
                img_name = img_url.split('/')[-1]
                r = requests.get(img_url, timeout=10)
                with open(f"{folder}/{img_name}", "wb") as p:
                    p.write(r.content)
                print(img_name + "---写入成功")
        except Exception as exc:
            # Deliberate best-effort: report the failure and go on with the
            # next article. Unlike a bare "except:", this lets Ctrl+C
            # (KeyboardInterrupt) stop the script, and the cause is shown.
            print('发生错误.继续采集', repr(exc))

# Store JSON-formatted data in an Excel file.

import json  # required by json.loads() below

import pandas as pd

d = '[{"name":"卢炎","age":18},{"name":"王婷","age":19}]'
# Parse the JSON text into a list of dicts.
data = json.loads(d)
# 1. Convert data into a DataFrame (df).
df = pd.DataFrame(data)
# 2. Rename the columns to Chinese (order follows the keys: name, age).
df.columns = ["姓名","年龄"]
# 3. Export to Excel.
df.to_excel('ly.xlsx')

详情页:

# Example: news headlines from the Juhe Data (聚合数据) API.
import os

import requests
import pandas as pd

# NOTE(review): an API key should not live in source code. It is read from
# the JUHE_APPKEY environment variable when set; the literal is kept only as
# a fallback so the example keeps working unchanged.
APPKEY = os.environ.get("JUHE_APPKEY", "da78dc675dc0dc40ad1e00d0554d2336")
url = f"http://v.juhe.cn/toutiao/index?type=keji&key={APPKEY}&is_filter=1"
# A timeout keeps the script from hanging on an unresponsive server.
r = requests.get(url, timeout=10)
payload = r.json()
try:
    data = payload['result']['data']
except (KeyError, TypeError) as exc:
    # The response has no result/data (e.g. the API rejected the request);
    # show what came back instead of an opaque lookup error.
    raise RuntimeError(f"新闻列表请求失败: {payload}") from exc
df = pd.DataFrame(data)
# Keep only the columns we need.
df = df[["uniquekey","title","date","category","author_name"]]
# Rename the columns.
df.columns = ["id","标题","日期","分类","作者"]
# Optional: export the headline list to Excel.
# df.to_excel('新闻头条.xlsx')

# Detail endpoint; filled with the API key and the uniquekey of one item.
url0 = "http://v.juhe.cn/toutiao/content?key={}&uniquekey={}"
data = []
for i in df["id"]:
    url = url0.format(APPKEY,i)
    r = requests.get(url, timeout=10)
    d = r.json()
    try:
        title = d['result']['detail']['title']
        content = d['result']['content']
    except (KeyError, TypeError):
        # One failed detail request should not discard the rows that were
        # already collected; report it and continue.
        print(f'{i} 详情获取失败,已跳过.')
        continue
    data.append({
        'title':title,
        'content':content
    })
df = pd.DataFrame(data)
df.to_excel('新闻详情.xlsx')

 

留言

给我留言