自动化爬虫 playwright实战篇(tx、ali225)

人人都笑金角，人人都是金角

推荐文章：

1、https://playwright.dev/python/docs/api/class-playwright //官方文档

2、https://cuiqingcai.com/36045.html //崔庆才教程

3、https://github.com/qqq732004709/ //实战参考

4、https://www.cnblogs.com/carl-/p/15761861.html //实战参考

5、https://www.cnblogs.com/james-wangx/p/16106304.html //实战参考

案例一：tx滑块（playwright）

目标网站：aHR0cHM6Ly93d3cudXJidGl4LmhrL2xvZ2lu

1、创建Chromium实例（headless 参数如果不设置为 False，默认以无头模式启动浏览器）

async with async_playwright() as p:

browser = await p.chromium.launch(headless=False, args=['--start-maximized'])

2、最大化窗口

context = await browser.new_context(viewport={"width": 1920, "height": 1080}, no_viewport=True)

3、新建标签页

page = await context.new_page()

4、加载过检测js

await page.add_init_script(js) #stealth.min.js

5、监听response事件

async def on_response(response):
    """Save the captcha images and print the verification result.

    Meant to be registered with ``page.on('response', on_response)``.

    * ``/cap_union_new_getcapbysig`` responses with status 200: the body of
      the ``img_index=1`` response is written to ``bg_picture.jpg`` and the
      body of the ``img_index=0`` response to ``cut_picture.png``; the URL is
      printed afterwards.
    * ``cap_union_new_verify`` responses with status 200: the response text
      is printed together with the URL.

    :param response: object exposing ``url``, ``status`` and the awaitable
        ``body()`` / ``text()`` methods.
    """
    if response.status != 200:
        return

    url = response.url

    if '/cap_union_new_getcapbysig' in url:
        if 'img_index=1' in url:
            filename = "bg_picture.jpg"
        elif 'img_index=0' in url:
            filename = "cut_picture.png"
        else:
            filename = None

        if filename is not None:
            # Reuse the bytes of the response already in hand instead of
            # issuing a second, blocking requests.get() from inside the
            # event loop. Fetch first so a failure does not leave an empty
            # file behind.
            data = await response.body()
            with open(filename, "wb") as f:
                f.write(data)
        print("response.url:", url)

    if 'cap_union_new_verify' in url:
        # Printed after the slider check has been answered by the server.
        result = await response.text()
        print("response.url:", url, result)

page.on('response',on_response)

6、打开网页、触发滑块

await page.goto('aHR0cHM6Ly93d3cudXJidGl4LmhrL2xvZ2lu')

await page.wait_for_timeout(1500)

await page.click('xpath=//*[@id="root"]/div/div[3]/div/div/div[5]/div/div')

await page.wait_for_timeout(500)

await page.click('xpath=//*[@id="root"]/div/div[3]/div/div/div[8]/div[2]/div')

7、识别坐标

def get_gap_offset():
    """Match the slider piece against the background and return the x offset.

    The region ``(160, 508, 243, 595)`` is cropped out of ``cut_picture.png``
    and saved back over that same file. The bytes of ``bg_picture.jpg`` and of
    the cropped file are then handed to ``ddddocr``'s ``slide_match``.

    :return: ``res['target'][0]`` of the match result, converted to ``int``.
    """
    matcher = ddddocr.DdddOcr(det=False, ocr=False, show_ad=False)

    # The downloaded slider image has to be cut down to the piece itself;
    # the crop replaces the file on disk.
    Image.open('cut_picture.png').crop((160, 508, 243, 595)).save('cut_picture.png')

    with open('bg_picture.jpg', 'rb') as bg_file:
        bg_bytes = bg_file.read()
    with open('cut_picture.png', 'rb') as piece_file:
        piece_bytes = piece_file.read()

    # Background bytes are passed first, the cropped piece second.
    match = matcher.slide_match(bg_bytes, piece_bytes, simple_target=True)
    print("识别到的坐标位置：", match)
    return int(match['target'][0])

8、找到滑动起始点，并滑动

async def move_down(page):
    """Press the slider handle inside the captcha iframe and drag it.

    The handle is located in the ``tcaptcha_iframe_dy`` iframe, the mouse is
    pressed at its centre, and after a 1.2 s pause the pointer is moved along
    the steps returned by ``get_track_list`` before the button is released.

    :param page: the Playwright page that shows the captcha.
    """
    # Locate the iframe, then the drag handle inside it.
    captcha_frame = page.frame_locator('iframe[id="tcaptcha_iframe_dy"]')
    handle = captcha_frame.locator('xpath=//*[@id="tcOperation"]/div[6]')

    # Position of the handle on the current page.
    box = await handle.bounding_box()
    print("目前点击的位置", box)

    # Move to the centre of the handle and press the button.
    center_x = box["x"] + box["width"] / 2
    center_y = box["y"] + box["height"] / 2
    await page.mouse.move(center_x, center_y)
    await page.mouse.down()
    await page.wait_for_timeout(1200)

    # Recognised offset divided by 1.97 (ratio between the recognised
    # coordinate and the page, per the article), minus 30.
    distance = int(get_gap_offset() / 1.97) - 30
    steps = get_track_list(distance)
    print("最终坐标：", distance, "轨迹：", steps)

    # Walk the track from the handle centre; y stays at the top of the box.
    current_x = center_x
    for step in steps:
        current_x += step
        await page.mouse.move(current_x, box["y"])
    await page.mouse.up()

9、关闭窗口

await browser.close()

至此tx滑块的分析就结束了

然后我还写了一版selenium的，相比于playwright就会麻烦一些。

案例一：tx滑块（selenium）

对于我们日常使用而言两者主要区别在于：

1、selenium只支持同步，playwright可以支持异步的

2、操作iframe,selenium来回切换iframe非常麻烦，而playwright只需要定位元素即可

3、在监听请求这一点上，playwright的page.on非常好用，而selenium一般是借助browsermobproxy通过代理的方式进行拦截

使用方式：(1)https://github.com/lightbody/browsermob-proxy/releases，下载并解压

(2)安装证书，参考链接https://www.bilibili.com/read/cv21263644/

(3)调用方式

# Start the BrowserMob Proxy server from the unpacked release directory.
server = Server('browsermob-proxy-2.1.4/bin/browsermob-proxy')
server.start()
proxy = server.create_proxy(params={'trustAllServers':'true'})

# Point Chrome at the proxy so its traffic goes through it.
option = ChromeOptions()
# `proxy` is a plain local in this snippet; there is no `self` here.
option.add_argument('--proxy-server={0}'.format(proxy.proxy))
driver = webdriver.Chrome(options=option)

这里就不细致讲解了，主要代码如下

class Tencent:
    """Selenium version of the slider walkthrough, with traffic captured by BrowserMob Proxy.

    Attributes set by ``__init__``:

    * ``server``  - the BrowserMob ``Server`` handle (stopped by ``close()``)
    * ``proxy``   - the proxy created on that server; records the HAR
    * ``url``     - the target page, stored exactly as given in the article
    * ``driver``  - the Chrome WebDriver routed through the proxy
    * ``det``     - the ``ddddocr`` instance used for slide matching
    * ``headers`` - a user-agent header dict
    """

    def __init__(self):
        """Start the proxy server and open a Chrome session that goes through it."""
        # Keep the server on the instance so close() can stop the proxy
        # process; as a local it could never be shut down.
        self.server = Server('browsermob-proxy-2.1.4/bin/browsermob-proxy')
        self.server.start()
        self.proxy = self.server.create_proxy(params={'trustAllServers':'true'})
        self.url = 'aHR0cHM6Ly93d3cudXJidGl4LmhrL2xvZ2lu'

        option = ChromeOptions()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        option.add_experimental_option('useAutomationExtension', False)
        option.add_argument('--proxy-server={0}'.format(self.proxy.proxy))

        # Start recording a HAR with bodies and headers.
        self.proxy.new_har(options={'captureContent': True,'captureHeaders': True})

        self.driver = webdriver.Chrome(options=option)
        self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': 'Object.defineProperty(navigator,"webdriver",{get: () => undefined})'
        })
        with open('stealth.min.js') as f:
            js = f.read()
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
        self.driver.maximize_window()

        self.det = ddddocr.DdddOcr(det=False, ocr=False, show_ad=False)
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}

    def close(self):
        """Quit the browser and stop the proxy server.

        Attributes that were never set are skipped. The server is stopped
        even when quitting the driver raises.
        """
        driver = getattr(self, 'driver', None)
        server = getattr(self, 'server', None)
        try:
            if driver is not None:
                driver.quit()
        finally:
            if server is not None:
                server.stop()

    def index(self):
        """Main flow: open the page, trigger the slider and download both images.

        Writes ``bg_picture.jpg`` and ``cut_picture.png`` to the working
        directory and leaves the driver switched into the captcha iframe.
        """
        self.driver.get(self.url)
        time.sleep(5)
        print("正在打开网页~~~")
        self.driver.find_element(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div/div/div[5]/div/div').click()
        time.sleep(1)
        self.driver.find_element(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div/div/div[8]/div[2]/div').click()
        time.sleep(5)

        self.driver.switch_to.frame('tcaptcha_iframe_dy')
        bg_style = self.driver.find_element('id','slideBg').get_attribute("style")
        cut_style = self.driver.find_element(by=By.XPATH, value='//*[@id="tcOperation"]/div[8]').get_attribute("style")

        # Raw strings: "\(" and "\)" are regex escapes, not string escapes.
        bg_url = re.findall(r'url\("(.*?)"\)', str(bg_style))[0]
        cut_url = re.findall(r'url\("(.*?)"\)', str(cut_style))[0]
        print("获取到背景图片url:",bg_url)
        print("获取到滑块图片url:",cut_url)

        with open("bg_picture.jpg", "wb") as f:
            f.write(requests.get(bg_url).content)
        with open("cut_picture.png", "wb") as f:
            f.write(requests.get(cut_url).content)

    def get_gap_offset(self):
        """Match the slider piece against the background and return the x offset.

        Crops ``(160, 508, 243, 595)`` out of ``cut_picture.png``, saves the
        crop over that file, then runs ``slide_match`` on the two images.

        :return: ``res['target'][0]`` of the match result, as ``int``.
        """
        img = Image.open('cut_picture.png')
        region = img.crop((160, 508, 243, 595))
        region.save('cut_picture.png')

        with open('bg_picture.jpg', 'rb') as f:
            target_bytes = f.read()
        with open('cut_picture.png', 'rb') as f:
            background_bytes = f.read()

        res = self.det.slide_match(target_bytes, background_bytes, simple_target=True)
        print("识别到的坐标位置：",res)
        distance = int(res['target'][0])
        return distance

    def get_track(self, offset):
        """Split the slider travel into four equal steps.

        :param offset: total distance to cover.
        :return: list of four equal step sizes, each ``(offset - 30) / 4``.
        """
        offset -= 30  # the slider does not start at 0; it has an initial offset
        a = offset / 4
        track = [a, a, a, a]
        return track

    def shake_mouse(self):
        """Move the pointer 2 px left and then 2 px right before release."""
        ActionChains(self.driver).move_by_offset(xoffset=-2, yoffset=0).perform()
        ActionChains(self.driver).move_by_offset(xoffset=2, yoffset=0).perform()

    def operate_slider(self, track):
        """Drag the slider handle along ``track`` and release it.

        :param track: iterable of x offsets, applied one after another.
        """
        # Locate the drag handle.
        slider_bt = self.driver.find_element(by=By.XPATH,value ='//*[@id="tcOperation"]/div[6]')
        # Press and hold it.
        ActionChains(self.driver).click_and_hold(slider_bt).perform()

        # Forward movement, with a random pause of up to 10 ms after each step.
        for i in track:
            ActionChains(self.driver).move_by_offset(xoffset=i, yoffset=0).perform()
            time.sleep(random.random() / 100)
        time.sleep(random.random())

        # Small backward movement.
        back_tracks = [-1, -0.5, -1]
        for i in back_tracks:
            time.sleep(random.random() / 100)
            ActionChains(self.driver).move_by_offset(xoffset=i, yoffset=0).perform()

        self.shake_mouse()
        time.sleep(random.random())

        # Release the handle.
        ActionChains(self.driver).release().perform()
        time.sleep(2)

    def login(self):
        """Run the whole flow and print the verify call found in the HAR.

        Calls ``index()``, scales the recognised offset by 1/1.97, drags the
        slider, then prints every HAR entry whose request URL equals the
        ``cap_union_new_verify`` endpoint.
        """
        self.index()
        distance = self.get_gap_offset()
        distance = int(distance/1.97)
        track = self.get_track(distance)
        self.operate_slider(track)

        result = self.proxy.har
        for entry in result['log']['entries']:
            if entry['request']['url'] == 'https://t.captcha.qq.com/cap_union_new_verify':
                print(entry['request']['url'],entry['response']['content'])
                print(entry['response']['content']['text'])

案例二：阿里225(playwright)

目标网站：aHR0cHM6Ly9wYXNzcG9ydC5kYW1haS5jbi9sb2dpbg==

1、前面的初始化流程

async with async_playwright() as p:

browser = await p.chromium.launch(headless=False, args=['--start-maximized'])

context = await browser.new_context(viewport={"width": 1920, "height": 1080}, no_viewport=True)

context.set_default_timeout(8000)

page = await context.new_page()

await page.add_init_script(js)

print("打开网页~~~")

await page.goto('aHR0cHM6Ly9wYXNzcG9ydC5kYW1haS5jbi9sb2dpbg==')

await page.wait_for_timeout(1000)

page.on('response', on_response)

2、输入账号密码

#这里需要注意这个iframe,前面的iframe和后面出滑块之后的iframe属于包含关系

new_frame = page.frame_locator('iframe[id="alibaba-login-box"]')

await page.wait_for_timeout(1000)

await new_frame.locator('#fm-login-id').fill("正确的手机号码")

await page.wait_for_timeout(1000)

await new_frame.locator('#fm-login-password').fill("错误的密码")

await page.wait_for_timeout(1000)

await new_frame.get_by_role("button", name="登录").click()

await page.wait_for_timeout(1000)

3、强制弹出滑块，并判断

#这里为了让它出滑块要先输入错误的密码，然后一直click，直到出滑块为止

while True:

try:

new_frame2 = new_frame.frame_locator('iframe[id="baxia-dialog-content"]')

move_tag = new_frame2.locator('xpath=//*[@id="nc_1_n1z"]')

number = await move_tag.count()

if number>=1:

box = await move_tag.bounding_box()

print("目前点击的位置", box)

break

else:

print(f"没出滑块，重新点击")

await page.wait_for_timeout(1000)

await new_frame.get_by_role("button", name="登录").click()

except:

await new_frame.get_by_role("button", name="登录").click()

4、定位以及滑动

async def move_down(page,box):
    """Press the mouse at the centre of ``box`` and drag it along a track.

    :param page: the Playwright page whose mouse is driven.
    :param box: bounding box with ``x``, ``y``, ``width`` and ``height``.
    """
    start_x = box["x"] + box["width"] / 2
    await page.mouse.move(start_x, box["y"] + box["height"] / 2)
    await page.mouse.down()  # press the button
    await page.wait_for_timeout(1200)

    steps = get_track_list(265)
    print("轨迹：", steps)

    # Start from the horizontal centre; y stays at the top of the box.
    cursor_x = start_x
    for delta in steps:
        cursor_x += delta
        await page.mouse.move(cursor_x, box["y"])

    await page.mouse.up()
    await page.wait_for_timeout(500)

至此ali滑块的分析就结束了

当脚下的路走起来比以前轻松了，是不是该问自己是否在走下坡路了，我也不知道呢

推荐链接

评论可见，请评论后查看内容，谢谢！！！

您阅读本篇文章共花了：

金钥匙

自动化爬虫 playwright实战篇(tx、ali225)

python selenium mongodb 东方财富股吧发帖与评论爬虫

开发语言 Java程序中使用 Jsoup 爬虫( 简单示例 )(1)

发表评论取消回复

金钥匙

自动化 爬虫 playwright实战篇(tx、ali225)

python selenium mongodb 东方财富股吧发帖与评论爬虫

开发语言 Java程序中使用 Jsoup 爬虫( 简单示例 )(1)

相关文章

发表评论取消回复

自动化爬虫 playwright实战篇(tx、ali225)