1. 查看谷歌浏览器的版本号.
2. 从网站:http://chromedriver.storage.googleapis.com/index.html 下载与版本号一致的chromedriver
3. 将下载的包解压后,将chromedriver.exe拷贝到python.exe所在目录.
4. 安装selenium包,pip install selenium
例子: 抓取河北师大新闻列表(该页面返回304,无法用常规的请求方式抓取,因此改用selenium)
# Example script: open the news list page in Chrome and print every headline.
import time

# Import the browser driver bindings.
from selenium import webdriver
from selenium.webdriver.common.by import By

# Create the options object and hand it to the driver, so that any option
# added to it later actually takes effect.
browser_option = webdriver.ChromeOptions()
# Create the browser object.
browser = webdriver.Chrome(options=browser_option)
try:
    # Open the target URL.
    url = 'http://news.hebtu.edu.cn/a/zhxw/index.html'
    browser.get(url)
    # Locate the headline links by XPath. The old find_elements_by_xpath
    # helper is deprecated (slated for removal), so use the current API.
    titles = browser.find_elements(By.XPATH, '//div[@class="item"]/ul/li/a')
    # Walk the matched elements and print their text.
    for title in titles:
        print(title.get_attribute('textContent'))
finally:
    # Always close the browser and stop the chromedriver process, even when
    # the page fails to load or an element lookup raises.
    browser.quit()
抓取下一页的内容
# Example script: open the news list, click "next page" once and print the
# headlines shown on the second page.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Create the options object and pass it to the driver so it is really used.
browser_option = webdriver.ChromeOptions()
# Create the browser object.
browser = webdriver.Chrome(options=browser_option)
try:
    # Request the list page.
    url = "http://news.hebtu.edu.cn/a/zhxw/index.html"
    browser.get(url)
    # Maximize the window.
    browser.maximize_window()
    # Find the "next page" link element and click it.
    next_page = browser.find_element(By.XPATH, '//span[@class="pagebox_next"]/a')
    next_page.click()
    # Collect the headline links of the page now displayed and print them.
    titles = browser.find_elements(By.XPATH, '//div[@class="item"]/ul/li/a')
    for title in titles:
        print(title.get_attribute('textContent'))
finally:
    # Always close the browser and stop the chromedriver process.
    browser.quit()
连续点击下一页
# Example script: print the headlines of the first page, then click
# "next page" five times, printing the headlines after every click.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Create the options object and pass it to the driver so it is really used.
browser_option = webdriver.ChromeOptions()
# Create the browser object.
browser = webdriver.Chrome(options=browser_option)
try:
    # Request the list page.
    url = "http://news.hebtu.edu.cn/a/zhxw/index.html"
    browser.get(url)
    # Maximize the window.
    browser.maximize_window()
    # Scrape the first page.
    titles = browser.find_elements(By.XPATH, '//div[@class="item"]/ul/li/a')
    for title in titles:
        print(title.get_attribute('textContent'))
    # Scrape the following pages.
    for _ in range(5):
        # Short pause before each page turn.
        time.sleep(0.5)
        print("---------------------我是分隔符----------------------------")
        # The link is looked up again on every iteration because the element
        # found on the previous page is no longer valid after navigation.
        next_page = browser.find_element(By.XPATH,
                                         '//a[contains(text(),"下一页")]')
        next_page.click()
        titles = browser.find_elements(By.XPATH, '//div[@class="item"]/ul/li/a')
        for title in titles:
            print(title.get_attribute('textContent'))
finally:
    # Always close the browser and stop the chromedriver process.
    browser.quit()
终极版
# Example script (final version): print the headlines of the first page, then
# keep clicking "next page" and printing headlines until paging fails.
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# Create the options object and pass it to the driver so it is really used.
browser_option = webdriver.ChromeOptions()
# Create the browser object.
browser = webdriver.Chrome(options=browser_option)
try:
    # Request the list page.
    url = "http://news.hebtu.edu.cn/a/zhxw/index.html"
    browser.get(url)
    # Maximize the window.
    browser.maximize_window()
    # Scrape the first page.
    titles = browser.find_elements(By.XPATH, '//div[@class="item"]/ul/li/a')
    for title in titles:
        print(title.get_attribute('textContent'))
    # Scrape the following pages.
    while True:
        # Short pause before each page turn.
        time.sleep(0.5)
        print("---------------------我是分隔符----------------------------")
        try:
            next_page = browser.find_element(By.XPATH,
                                             '//a[contains(text(),"下一页")]')
            next_page.click()
            titles = browser.find_elements(By.XPATH, '//div[@class="item"]/ul/li/a')
            for title in titles:
                print(title.get_attribute('textContent'))
        except WebDriverException:
            # Any Selenium failure (typically: no "next page" link left) ends
            # the crawl. Unlike a bare ``except``, this lets Ctrl+C and
            # SystemExit propagate instead of being reported as a normal end.
            print("采集结束.")
            break
finally:
    # Always close the browser and stop the chromedriver process.
    browser.quit()
留言