这是利用业余时间编写的,基于selenium的微博关键字搜索结果全自动爬虫,支持自定义搜索关键字、搜索起始时间、爬取起始页数(以实现中断后接上次继续爬取)。爬取内容包括微博账号、发文时间、发送平台、微博内容、转发次数、评论次数、点赞次数、原博地址,并导出为CSV文件。自己测试持续稳定运行24小时以上,连续爬取微博数量3W以上。

下面放上目前版本v1.0.3的代码,仅作为个人练手项目,随缘更新:

from selenium import webdriver

from selenium.webdriver.chrome.options import Options

from selenium.webdriver.common.by import By

from lxml import etree

import pandas

import requests

import json

import datetime

import random

import time

import re

import selenium.common.exceptions

# from get_cookie import get_cookie

class GetWeibo:
    """Selenium-driven crawler for Weibo keyword-search results.

    Class-level state: a single headless Chrome instance shared by all
    methods, created (and announced) at class-definition time.
    """

    browser_options = Options()
    # Run Chrome without a visible window.
    browser_options.add_argument("--headless")
    # Disable the sandbox if needed (e.g. running as root in a container).
    # browser_options.add_argument("--no-sandbox")
    # BUG FIX: the `chrome_options=` keyword was deprecated in Selenium 3.8
    # and removed in Selenium 4 (which this file targets — it uses
    # find_element(By.XPATH, ...)); the supported keyword is `options=`.
    browser = webdriver.Chrome(options=browser_options)
    # Desktop-Chrome user agent; available for requests-based calls
    # (note: Selenium itself does not read this dict).
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
                             '/103.0.0.0 Safari/537.36'}
    print("浏览器已成功创建。")

    def __init__(self):
        """Initialise crawl state, then immediately start the crawl loop.

        The attributes set to None here are filled in interactively by
        open_search(): keywords (search term), origin (scope query-string
        fragment) and time_judge (datetime cut-off for posts).
        """
        self.base_url = 'https://s.weibo.com/weibo'
        self.keywords = None
        self.origin = None
        self.time_judge = None
        # If the saved cookie has expired, rerun get_cookie() first.
        # get_cookie()
        self.main()

def open_search(self):

self.browser.get(self.base_url)

self.browser.delete_all_cookies()

time.sleep(8)

print(f'微博搜索页面{self.browser.current_url}已成功打开...')

kw = self.browser.find_element(By.XPATH, ('//div[@class="searchbox"]/div[@class="search-input"]/'

'input[@type="text"]'))

self.keywords = input('请输入微博搜索的关键词,按回车键确认:')

print(f'搜索关键词为:{self.keywords}。')

while True:

self.origin = input('搜索所有微博请输入1,按回车键确认,直接按回车键则只搜索原创微博:')

if self.origin == '':

self.origin = '&scope=ori'

print('仅搜索原创微博。')

break

elif self.origin == '1':

self.origin = '&typeall=1'

print('搜索全部微博。')

break

else:

print('输入错误,请重新输入。')

continue

while True:

date_time = input('请按年-月-日-时的格式输入抓取微博的发布截止时间(示例:2022-08-03-07),按回车键确认,直接按回车键则截止时间为当前时间:')

if date_time == '':

date_format = '%Y-%m-%d-%H'

date_time = datetime.datetime.now().strftime(date_format)

date_time = (datetime.datetime.strptime(date_time, date_format) + (

datetime.timedelta(hours=+1))).strftime(date_format)

print('截止时间为:当前时间。')

break

elif re.match(r'(2\d{3})-'

r'('

r'('

r'(0[13578]|1[02])-(0[1-9]|[12]\d|3[01])-'

r')|'

r'('

r'(0[469]|11)-(0[1-9]|[12]\d|30)-'

r')|'

r'('

r'02-(0[1-9]|1[\d|2[0-8])-'

r')'

r'('

r'(0)|([01]\d)|(2[0-3])'

r')'

r')', date_time) is None:

print('时间格式输入错误,请重新输入!')

continue

else:

print(f'截止时间为:{date_time}。')

break

self.time_judge = datetime.datetime.strptime(date_time, '%Y-%m-%d-%H')

while True:

page_begin = input('请输入微博列表的抓取起始页(0至50之间),按回车键确认,直接按回车键从第1页开始:')

if page_begin == '':

page_begin = ''

print('抓取起始页为:第1页。')

break

elif re.match(r'([1-4]\d|50)', page_begin) is None:

print('抓取起始页输入错误,请重新输入!')

continue

else:

print(f'抓取起始页为:第{page_begin}页。')

page_begin = '&page=' + str(page_begin)

break

kw.send_keys(self.keywords)

click_search = self.browser.find_element(By.XPATH, '//div[@class="searchbox"]/button[@class="s-btn-b"]')

click_search.click()

time.sleep(1)

click_list = self.browser.find_element(By.XPATH, '//div[@class ="m-main-nav"]/ul/li[2]/a')

click_list.click()

time.sleep(1)

print(f'微博列表页面{self.browser.current_url}已成功打开,列表按时间倒序排序。')

with open('cookies.txt', 'r') as f:

cookies_list = json.load(f)

for cookie in cookies_list:

if isinstance(cookie.get('expiry'), float):

cookie['expiry'] = int(cookie['expiry'])

self.browser.add_cookie(cookie)

self.browser.refresh()

date_format = '%Y-%m-%d-%H'

date_past = (datetime.datetime.strptime(date_time, date_format) + datetime.timedelta(days=-31)).strftime(

date_format)

url = self.browser.current_url

url_change = re.search(r'(.*)(?=q=)', url)

url = url_change.group() + f'q={self.keywords}{self.origin}&suball=1×cope=custom:{date_past}:{date_time}&Refer=g{page_begin}'

print(f'本次抓取的开始时间是:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

search_times = 0

return url, search_times

def auto_search(self, url, search_times):

if url != self.browser.current_url:

self.browser.get(url)

print(f'微博列表页面{self.browser.current_url}已打开,抓取中...')

time.sleep(1)

data = etree.HTML(self.browser.page_source)

post_url = data.xpath('//p[@class="from"]/a[1]/@href')

if len(post_url) == 0:

post_url = data.xpath('//div[@class="from"]/a[1]/@href')

df = pandas.DataFrame(

columns=['微博账号', '发文时间', '发送平台', '微博内容', '转发次数', '评论次数', '点赞次数', '原博地址'])

for index, url_single in enumerate(post_url):

url = 'https:' + url_single

print(url)

while True:

self.browser.get(url)

time.sleep(1)

post = etree.HTML(self.browser.page_source)

names = post.xpath('//a[@usercard]/span[@title]/text()')

print(names)

time_ = post.xpath('//a[@title][@href][@class][1]/text()')

time_ = f'20{"".join(time_).strip()}'

if time_ == '20':

print('解析错误,正在处理...')

time.sleep(60)

self.browser.back()

continue

elif index == 0:

try:

time_mark = datetime.datetime.strptime(time_, '%Y-%m-%d %H:%M')

if time_mark > self.time_judge:

next_time = (datetime.datetime.now() + (datetime.timedelta(seconds=+3601))).strftime(

'%Y-%m-%d %H:%M:%S')

_target_time = datetime.datetime.strptime(next_time, '%Y-%m-%d %H:%M:%S')

_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')

print(f'达到单时段最大次数限制。当前时间是{_time},目前已抓取{search_times}条微博,下次抓取时间:{next_time},现在睡眠中...')

while datetime.datetime.now() < _target_time:

time.sleep(60)

self.browser.back()

click_next = None

while True:

try:

click_next = self.browser.find_element(By.XPATH,

'//div[@class="m-page"]/div/a[@class="next"]')

break

except selenium.common.exceptions.NoSuchElementException as E:

print(repr(E))

next_time = (

datetime.datetime.now() + (datetime.timedelta(seconds=+3601))).strftime(

'%Y-%m-%d %H:%M:%S')

_target_time = datetime.datetime.strptime(next_time, '%Y-%m-%d %H:%M:%S')

_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')

print(

f'达到单时段最大次数限制。当前时间是{_time},目前已抓取{search_times}条微博,下次抓取时间:{next_time},现在睡眠中...')

while datetime.datetime.now() < _target_time:

time.sleep(120)

self.browser.back()

continue

click_next.click()

url = self.browser.current_url

return url, search_times

except ValueError as VE:

print(repr(VE))

time.sleep(60)

self.browser.back()

continue

break

print(time_)

from1 = post.xpath('//div[@class="woo-box-flex"]/div[@title]/text()')

from2 = post.xpath('//div[@class="woo-box-flex"]/div[contains(@class, "head-info_cut")]/text()')

from1 = ''.join(from1)

from2 = ''.join(from2)

from_all = from1 + from2

blogs = post.xpath('//div[contains(@class, "detail_text")]/div/text()')

blogs = ''.join(blogs)

forward = post.xpath('//span[@class="woo-pop-ctrl"]/div/span/text()')

forward = [0 if i == ' 转发 ' else i for i in forward]

if '万' not in forward:

pass

else:

forward = ''.join(forward)

forward = int(float(forward[0:-1]) * 10000)

comments = post.xpath('//div[contains(@class, "woo-box-item-flex toolbar_item")]'

'/div[contains(@class, "woo-box-flex")]/span/text()')

comments = [0 if i == ' 评论 ' else i for i in comments]

if '万' not in comments:

pass

else:

comments = ''.join(comments)

comments = int(float(comments[0:-1]) * 10000)

likes = post.xpath('//div[contains(@class, "toolbar_likebox")]/button/span[@class="woo-like-count"]/text()')

likes = [0 if i == '赞' else i for i in likes]

if '万' not in likes:

pass

else:

likes = ''.join(likes)

likes = int(float(likes[0:-1]) * 10000)

key_list = ['微博账号', '发文时间', '发送平台', '微博内容', '转发次数', '评论次数', '点赞次数', '原博地址']

info_list = [names, time_, from_all, blogs, forward, comments, likes, url]

csv_info = dict(zip(key_list, info_list))

df1 = pandas.DataFrame(csv_info, columns=key_list)

df = pandas.concat([df, df1])

time.sleep(.5)

self.browser.back()

search_times += 1

df.to_csv('weibo_spider.csv', mode='a', encoding='utf_8_sig', header=False, index=False)

url = self.browser.current_url

page_num = url.split('page=')[-1]

if page_num.isdigit() is False:

page_num = '1'

page_num = int(page_num)

_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')

print(f'已成功提取第{page_num}页微博信息并追加写入CSV文件!当前时间是{_time},目前已抓取{search_times}条微博。')

if page_num == 50:

post = etree.HTML(self.browser.page_source)

time_last = post.xpath('//p[@class="from"]/a[1]/text()')

if len(time_last) == 0:

time_last = post.xpath('//div[@class="from"]/a[1]/text()'

time_last = time_last[-1]

if '年' in time_last:

year_num = ''.join(re.findall(r'(\d+)', time_last)[0])

mon_num = ''.join(re.findall(r'(\d+)', time_last)[1])

day_num = ''.join(re.findall(r'(\d+)', time_last)[2])

hour_num = ''.join(re.findall(r'(\d+)', time_last)[3])

min_num = ''.join(re.findall(r'(\d+)', time_last)[4])

elif '今天' in time_last:

year_num = str(datetime.datetime.today().year)

mon_num = str(datetime.datetime.today().month)

day_num = str(datetime.datetime.today().day)

hour_num = ''.join(re.findall(r'(\d+)', time_last)[2])

min_num = ''.join(re.findall(r'(\d+)', time_last)[3])

else:

year_num = str(datetime.datetime.today().year)

mon_num = ''.join(re.findall(r'(\d+)', time_last)[0])

day_num = ''.join(re.findall(r'(\d+)', time_last)[1])

hour_num = ''.join(re.findall(r'(\d+)', time_last)[2])

min_num = ''.join(re.findall(r'(\d+)', time_last)[3])

time_last = year_num + '-' + mon_num + '-' + day_num + ' ' + hour_num + ':' + min_num

time_last = datetime.datetime.strptime(time_last, '%Y-%m-%d %H:%M')

time_last = datetime.datetime.strftime(time_last, '%Y-%m-%d-%H')

date_format = '%Y-%m-%d-%H'

time_begin = (datetime.datetime.strptime(time_last, date_format) +

datetime.timedelta(days=-31)).strftime(date_format)

time_end = (datetime.datetime.strptime(time_last, date_format) +

datetime.timedelta(hours=+1)).strftime(date_format)

print(f'这是第50页,本页最后一条微博时间为{time_last}。当前时间是{_time},目前已抓取{search_times}条微博。准备跳转页面...')

url = self.browser.current_url

url_change = re.search(r'(.*)(?=q=)', url)

url = url_change.group() + f'q={self.keywords}{self.origin}&suball=1×cope=custom:{time_begin}:{time_end}&Refer=g&page=1'

return url, search_times

click_next = None

while True:

try:

click_next = self.browser.find_element(By.XPATH,

'//div[@class="m-page"]/div/a[@class="next"]')

break

except selenium.common.exceptions.NoSuchElementException as E:

print(repr(E))

next_time = (datetime.datetime.now() + (datetime.timedelta(seconds=+3601))).strftime('%Y-%m-%d %H:%M:%S')

_target_time = datetime.datetime.strptime(next_time, '%Y-%m-%d %H:%M:%S')

_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')

print(f'达到单时段最大次数限制。当前时间是{_time},目前已抓取{search_times}条微博,下次抓取时间:{next_time},现在睡眠中...')

while datetime.datetime.now() < _target_time:

time.sleep(60)

self.browser.back()

continue

click_next.click()

url = self.browser.current_url

return url, search_times

def main(self):

url, search_times = self.open_search()

while True:

url, search_times = self.auto_search(url, search_times)

if __name__ == '__main__':
    # Constructing GetWeibo starts the crawl immediately (see __init__).
    gt = GetWeibo()

 

获取cookie,需要登录自己的微博账号,建议手机APP扫码:

from selenium import webdriver

from selenium.webdriver.chrome.options import Options

import json

import time

# Cookie-grabber helper: opens a visible Chrome window for a manual
# QR-code login, then dumps the session cookies for the crawler.
browser_options = Options()
# BUG FIX: `chrome_options=` was removed in Selenium 4; use `options=`.
browser = webdriver.Chrome(options=browser_options)
# Desktop-Chrome user agent kept for parity with the crawler script.
# NOTE(review): unused in this script — Selenium does not read this dict.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
                         '/103.0.0.0 Safari/537.36'}
print("浏览器已成功创建。")

def get_cookie(url='https://weibo.com/login.php'):
    """Open the Weibo login page, wait for a manual QR-code login,
    then save the session cookies to cookies.txt.

    Args:
        url: Login page to open; defaults to the Weibo login URL.
    """
    # (Removed the original's redundant `url = url` self-assignment.)
    browser.get(url)
    print('请在25秒内,使用微博APP扫码登录你的账号...')
    time.sleep(25)
    # The with-statement closes the file; the original's explicit
    # f.close() inside the block was redundant.
    with open('cookies.txt', 'w') as f:
        json.dump(browser.get_cookies(), f)
    print('已成功保存cookie信息。')

if __name__ == '__main__':
    get_cookie()

 

 

精彩链接

评论可见,请评论后查看内容,谢谢!!!
 您阅读本篇文章共花了: