爬虫学习-selenium工具使用

文章目录

前言 / selenium工具使用 / 相关操作 / 窗口跳转 / 无头浏览器(后台运行) / 超级鹰(破解验证码) / 处理12306登陆验证

总结

前言

本博客仅做学习笔记，如有侵权，联系后即刻更改

科普：

参考网址

selenium工具使用

自动化测试工具：selenium引入

首先在pycharm里面下载安装相关模块

pip install selenium

它要启动你电脑上的浏览器, 这就需要一个驱动程序来辅助

Edge驱动安装导航、Edge驱动运行异常解决

# 能不能让我的程序连接到浏览器 . 让浏览器来完成各种复杂的操作, 我们只接受最终的结果

# selenium: 自动化测试工具

# 可以: 打开浏览器. 然后像人一样去操作浏览器

# 程序员可以从selenium中直接提取网页上的各种信息

# 让selenium启动Edge浏览器

from selenium.webdriver import Edge

# msedgedriver.exe下载后放到浏览器的目录下

# 1. Create the browser object.
# The driver path is a raw string so the backslashes of the Windows path are
# taken literally instead of being parsed as (invalid) escape sequences.
web = Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")

# 2. Open a URL.
web.get("http://www.baidu.com")

# Print the title of the page that was loaded.
print(web.title)

相关操作

进入拉勾网查询python相关岗位

查找和获取网页相关位置内容的xpath；点击按钮和输入相关文本

from selenium.webdriver import Edge

from selenium.webdriver.common.keys import Keys

import time

# Start Edge through the driver. The path is a raw string so its backslashes
# are taken literally instead of being parsed as (invalid) escape sequences.
web = Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")
web.get("http://lagou.com")

# Find an element and click it (presumably the close button of a pop-up,
# judging by its id -- confirm against the live page).
el = web.find_element_by_xpath('//*[@id="cboxClose"]')
el.click()  # click event
time.sleep(1)  # give the browser a moment

# Find the search box, type "python" and press Enter to run the search.
web.find_element_by_xpath('//*[@id="search_input"]').send_keys("python", Keys.ENTER)
time.sleep(1)

# Locate where the data lives and extract it:
# collect every <li> under the position list.
li_list = web.find_elements_by_xpath('//*[@id="s_position_list"]/ul/li')

# The loop body must be indented under the `for`; each <li> is queried
# relative to itself (tag name / "./" XPath) for the three fields printed.
for li in li_list:
    job_name = li.find_element_by_tag_name("h3").text
    job_price = li.find_element_by_xpath("./div/div/div[2]/div/span").text
    company_name = li.find_element_by_xpath('./div/div[2]/div/a').text
    print(company_name, job_name, job_price)

窗口跳转

from selenium.webdriver import Chrome

from selenium.webdriver.common.keys import Keys

import time

web = Chrome()

# --- Example kept for reference: working with a newly opened window ---------
# web.get("http://lagou.com")
# web.find_element_by_xpath('//*[@id="cboxClose"]').click()
# time.sleep(1)
# web.find_element_by_xpath('//*[@id="search_input"]').send_keys("python", Keys.ENTER)
# time.sleep(1)
# web.find_element_by_xpath('//*[@id="s_position_list"]/ul/li[1]/div[1]/div[1]/div[1]/a/h3').click()
#
# # How to move into the new window to extract data:
# # note that Selenium does NOT switch to a new window by default.
# web.switch_to.window(web.window_handles[-1])
#
# # Extract content inside the new window.
# job_detail = web.find_element_by_xpath('//*[@id="job_detail"]/dd[2]/div').text
# print(job_detail)
#
# # Close the child window.
# web.close()
#
# # Point Selenium back at the original window.
# web.switch_to.window(web.window_handles[0])
# print(web.find_element_by_xpath('//*[@id="s_position_list"]/ul/li[1]/div[1]/div[1]/div[1]/a/h3').text)
# -----------------------------------------------------------------------------

# Handling an <iframe> inside a page.
web.get("https://www.91kanju.com/vod-play/541-2-1.html")

# An iframe has to be located first, then Selenium's context is switched into
# it; only after that can elements inside the frame be found.
player_frame = web.find_element_by_xpath('//*[@id="player_iframe"]')
web.switch_to.frame(player_frame)
# web.switch_to.default_content()  # switch back to the outer page

# Read the text of the first <h3> under #main inside the frame and print it.
heading_text = web.find_element_by_xpath('//*[@id="main"]/h3[1]').text
print(heading_text)

无头浏览器(后台运行)

from selenium.webdriver import Chrome

from selenium.webdriver.chrome.options import Options

from selenium.webdriver.support.select import Select

import time

# Prepare the browser options.
opt = Options()
opt.add_argument("--headless")
# Fixed spelling: the original passed "--disbale-gpu", which is not the
# switch name the author intended ("--disable-gpu").
opt.add_argument("--disable-gpu")

web = Chrome(options=opt)  # apply the options to the browser
web.get("https://www.endata.com.cn/BoxOffice/BO/Year/index.html")
time.sleep(2)

# --- Example kept for reference: driving a <select> drop-down ---------------
# # Locate the drop-down list.
# sel_el = web.find_element_by_xpath('//*[@id="OptionDate"]')
# # Wrap the element as a Select (drop-down menu) helper.
# sel = Select(sel_el)
# # Make the browser step through the options.
# for i in range(len(sel.options)):  # i is the index of each option
#     sel.select_by_index(i)  # switch by index
#     time.sleep(2)
#     table = web.find_element_by_xpath('//*[@id="TableList"]/table')
#     print(table.text)  # print all text in the table
#     print("===================================")
# print("运行完毕. ")
# web.close()
# -----------------------------------------------------------------------------

# How to get the page's Elements source, i.e. the HTML after data loading and
# JavaScript execution have finished.
print(web.page_source)

超级鹰(破解验证码)

from selenium.webdriver import Chrome

from chaojiying import Chaojiying_Client

import time

# NOTE(review): the account name, password and software id are hard-coded in
# this tutorial script; do not keep real credentials in source like this.
ACCOUNT = '18614075987'
PASSWORD = '6035945'
SOFT_ID = '914467'

# Every element used below lives under the same login <form>.
FORM_XPATH = '/html/body/div[3]/div/div[3]/div[1]/form'

web = Chrome()
web.get("http://www.chaojiying.com/user/login/")

# Handle the captcha: screenshot the captcha image element as PNG bytes and
# send it to the recognition service. The second argument (1902) is passed
# through as given -- presumably the service's captcha-type code; confirm
# against the Chaojiying documentation.
captcha_png = web.find_element_by_xpath(FORM_XPATH + '/div/img').screenshot_as_png
chaojiying = Chaojiying_Client(ACCOUNT, PASSWORD, SOFT_ID)
verify_code = chaojiying.PostPic(captcha_png, 1902)['pic_str']

# Fill in user name, password and the recognised captcha text.
web.find_element_by_xpath(FORM_XPATH + '/p[1]/input').send_keys(ACCOUNT)
web.find_element_by_xpath(FORM_XPATH + '/p[2]/input').send_keys(PASSWORD)
web.find_element_by_xpath(FORM_XPATH + '/p[3]/input').send_keys(verify_code)

time.sleep(5)

# Click the login button.
web.find_element_by_xpath(FORM_XPATH + '/p[4]/input').click()

处理12306登陆验证

from selenium.webdriver import Chrome

from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver.chrome.options import Options

from chaojiying import Chaojiying_Client

import time

# Initialise the Chaojiying captcha-recognition client.
# NOTE(review): credentials are hard-coded in this tutorial script; do not
# keep real credentials in source like this.
chaojiying = Chaojiying_Client('18614075987', '6035945', '914467')

# What if the site detects that the browser is automated?
#
# 1. Chrome version below 88: when the browser starts (before any page content
#    is loaded), inject JavaScript into the page to remove `webdriver`.
# web = Chrome()
# web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
#     "source": """
#         navigator.webdriver = undefined
#         Object.defineProperty(navigator, 'webdriver', {
#             get: () => undefined
#         })
#     """
# })
# web.get(xxxxxxx)
#
# 2. Chrome version 88 or later: use the option below.
option = Options()
# option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument('--disable-blink-features=AutomationControlled')

web = Chrome(options=option)
web.get("https://kyfw.12306.cn/otn/resources/login.html")
time.sleep(2)

web.find_element_by_xpath('/html/body/div[2]/div[2]/ul/li[2]/a').click()
time.sleep(3)

# Handle the captcha first.
verify_img_element = web.find_element_by_xpath('//*[@id="J-loginImg"]')

# Send a screenshot of the captcha element to Chaojiying for recognition.
dic = chaojiying.PostPic(verify_img_element.screenshot_as_png, 9004)
result = dic['pic_str']  # format: x1,y1|x2,y2|x3,y3
rs_list = result.split("|")

# The loop body must be indented under the `for`; each entry is one "x,y"
# point that has to be clicked on the captcha image.
for rs in rs_list:  # x1,y1
    p_temp = rs.split(",")
    x = int(p_temp[0])
    y = int(p_temp[1])
    # Move the mouse to the given offset on the image, then click.
    # An action chain only runs once perform() is called.
    # NOTE(review): newer Selenium releases may measure this offset from the
    # element's centre rather than its top-left corner -- confirm against the
    # installed Selenium version.
    ActionChains(web).move_to_element_with_offset(verify_img_element, x, y).click().perform()

# NOTE(review): indentation was lost in the source; this pause is assumed to
# follow the loop rather than run once per click.
time.sleep(1)

# Enter the user name and password.
web.find_element_by_xpath('//*[@id="J-userName"]').send_keys("123456789")
web.find_element_by_xpath('//*[@id="J-password"]').send_keys("12346789")

# Click the login button.
web.find_element_by_xpath('//*[@id="J-login"]').click()
time.sleep(5)

# Drag the slider element 300 pixels to the right.
btn = web.find_element_by_xpath('//*[@id="nc_1_n1z"]')
ActionChains(web).drag_and_drop_by_offset(btn, 300, 0).perform()

总结

小小励志

有些事你现在不做，一辈子都不会做了。如果你想做一件事，全世界都会为你让路。《搭车去柏林》

金钥匙

爬虫学习-selenium工具使用

测试工具程序人生自动化测试 python 软件测试单元测试强！1.8k star，推荐一款将Requests和Selenium无缝衔接的爆款工具！

hadoop 数据仓库构建 hive 时间维表

发表评论取消回复

金钥匙

爬虫学习-selenium工具使用

测试工具 程序人生 自动化测试 python 软件测试 单元测试 强！1.8k star，推荐一款将Requests和Selenium无缝衔接的爆款工具！

hadoop 数据仓库 构建 hive 时间维表

相关文章

发表评论取消回复

测试工具程序人生自动化测试 python 软件测试单元测试强！1.8k star，推荐一款将Requests和Selenium无缝衔接的爆款工具！

hadoop 数据仓库构建 hive 时间维表