Python Weibo crawler: batch-fetching posts from a specified account
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from datetime import datetime
import re
# Path to the ChromeDriver executable (make sure the chromedriver path is correct)
chrome_driver_path = 'C:/Users/Administrator/Downloads/Compressed/chromedriver_win32/chromedriver.exe'
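# Note: with Selenium 4.6+, the bundled Selenium Manager can resolve a matching
# chromedriver automatically, so webdriver.Chrome() with no explicit Service
# path is also an option.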
# 1. Log in to Weibo with your own account via browser automation: click through any captcha manually, or scan the QR code if one appears
def login(username, password):
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service)
    driver.maximize_window()
    # Navigate to the target page
    driver.get('https://weibo.com')
    # Wait for the page to load (adjust the wait time as needed)
    time.sleep(15)
    # Click the "log in now" button
    login_button = driver.find_element(By.XPATH, "//*[@id='__sidebar']/div/div[2]/div[1]/div/button")
    login_button.click()
    # Wait for the page to load (adjust the wait time as needed)
    time.sleep(2)
    driver.switch_to.window(driver.window_handles[-1])
    # A login dialog pops up
    # Click the "account login" tab
    driver.find_element(By.XPATH, "//*[@id='app']/div/div/div[2]/div[2]/ul/li[2]/a").click()
    # Enter the username and password
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/form/div[1]/input').send_keys(username)
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/form/div[2]/input').send_keys(password)
    # Click the login button
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/button').click()
    # Wait for login to complete (adjust the wait time as needed)
    time.sleep(20)
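# The fixed time.sleep() calls above are fragile: they either waste time or fire
# before the page is ready. As a sketch of a more robust alternative, Selenium's
# explicit waits block only until the element is actually usable. This helper is
# an addition for illustration, not part of the original flow:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_and_click(driver, xpath, timeout=30):
    # Wait until the element located by xpath is clickable, then click it
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )
    element.click()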
# 2. After logging in, crawl the account's posts
uid = '6282329711'  # user id; grab it from devtools via https://weibo.com/ajax/statuses/mymblog?uid=6282329711&page=1&feature=0
full_content_url_template = 'https://weibo.com/ajax/statuses/longtext?id={}'  # template URL for posts that need an "expand" click
url = 'https://weibo.com/ajax/statuses/mymblog?uid={}&page={}&feature=0'  # template URL for one page of posts
referer_url = 'https://weibo.com/u/{}'  # referer URL
# Copy from the browser devtools after logging in
cookie = 'UOR=www.paperyy.com,service.weibo.com,www.paperyy.com; SINAGLOBAL=9732409442892.457.1685464393254; ULV=1716132809564:2:2:1:4208533399564.4937.1716132809555:1685464393294; XSRF-TOKEN=eYObo3SCebGWa1Qh0KjRJhpk; SUB=_2A25LTwvDDeRhGeBM41AS8ifLyj2IHXVoJQELrDV8PUNbmtANLVfskW9NRLDE4T7-ix-_0UDTlmMmTfpLSErD9P7D; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFbwUE27mTLonPIWJBKMPAR5JpX5KzhUgL.FoqE1hz0eo.NeK22dJLoIEXLxK-LBo5L12qLxKML12eLB-zLxKML1hnLBo2LxKMLB.eL1KqLxKMLBKnL12zt; ALF=02_1718814867; WBPSESS=Dt2hbAUaXfkVprjyrAZT_C-r5K0-gEJyRI6VaswXJvDnYyZUZqulbfZ3htR25AGrSlkEUHpYmh1Gvd7zzn4dRMsmbAxq5I8hMphxpySyYskzFcaidhCiqvoh75BgoHebQjgEUFzIGjBs7ilfB_4zx7zabbYROJG2BJvEHMRLNrEmsL1Ht7ajeqSVt6mybviShFHjkliWEa_wEa7ndBjICg=='
token = 'eYObo3SCebGWa1Qh0KjRJhpk'  # copy from the devtools after logging in
headers = {
    'referer': referer_url.format(uid),
    'x-requested-with': 'XMLHttpRequest',
    'x-xsrf-token': token,
    'cookie': cookie,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
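# Optional sanity check (a sketch; check_login is a hypothetical helper, not part
# of the original script): request the first page and verify that the saved cookie
# and token are still accepted before starting the full crawl.
def check_login():
    resp = requests.get(url.format(uid, 1), headers=headers, timeout=10)
    return resp.status_code == 200 and resp.json().get('ok') == 1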
def fetch_weibo_data():
    content_list = []
    time_list = []
    source_list = []
    attitudes_count_list = []
    comments_count_list = []
    reposts_count_list = []
    num = 1
    since_id = ''
    while True:
        try:
            # Request one page of posts
            this_url = url.format(uid, num)
            if since_id != '':
                this_url += '&since_id={}'.format(since_id)
            print("Fetching: " + this_url)
            response = requests.get(this_url, headers=headers, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                # Parse the post data
                data = content['data']
                print('total: ' + str(data['total']))
                if data['total'] == 0:
                    break
                since_id = str(data['since_id'])
                for item in data['list']:
                    text = item['text']
                    # Truncated posts carry an "expand" marker; fetch their full text separately
                    if '<span class="expand">展开</span>' in text:
                        text = get_full_content(item['mblogid'])
                    content_list.append(text)
                    time_list.append(format_datetime(item['created_at']))
                    source_list.append(get_source(item['source']))
                    attitudes_count_list.append(item['attitudes_count'])
                    comments_count_list.append(item['comments_count'])
                    reposts_count_list.append(item['reposts_count'])
                num += 1
                print('since_id: ' + str(data['since_id']))
                visit_rum(this_url)
                time.sleep(1)
            else:
                # Stop instead of retrying forever: ok != 1 usually means the login has expired
                print("Failed to fetch data; check your network connection and Weibo credentials")
                break
        except requests.RequestException:
            print("Request timed out, retrying...")
            time.sleep(2)
    # Save the scraped posts to an Excel file (column headers kept in Chinese, matching the original output)
    weibodata = {
        '微博内容': content_list,
        '时间': time_list,
        '来源': source_list,
        '点赞数': attitudes_count_list,
        '评论数': comments_count_list,
        '转发数': reposts_count_list
    }
    df = pd.DataFrame(weibodata)
    df.to_excel('weibo_{}_{}.xlsx'.format(uid, time.strftime('%Y-%m-%d %H_%M_%S', time.localtime())), index=False)
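# Note: DataFrame.to_excel() needs an Excel writer engine; for .xlsx output pandas
# uses openpyxl, so `pip install openpyxl` may be required.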
def visit_rum(url_name):
    # Replay the front end's RUM (performance log) beacon
    form_data = {
        "name": url_name,
        "entryType": "resource",
        "responseStatus": 200,
        "serverTiming": [],
        "dns": 0,
        "tcp": 0,
        "ttfb": 433.90000000000873,
        "pathname": "https://weibo.com/ajax/statuses/mymblog",
        "speed": 0
    }
    while True:
        try:
            rum_url = 'https://weibo.com/ajax/log/rum'
            print("Fetching: " + rum_url)
            response = requests.post(rum_url, headers=headers, data=form_data, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                break
        except requests.RequestException:
            print("Request timed out, retrying...")
            time.sleep(2)
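# Why replay this beacon (an assumption, inferred from the payload): weibo.com's
# front end POSTs a performance log to /ajax/log/rum after each data request, so
# sending it as well makes the crawl look more like ordinary browser traffic.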
# Fetch the full text of a truncated ("expand") post
def get_full_content(id):
    longcontent = ''
    while True:
        try:
            print("Fetching: " + full_content_url_template.format(id))
            response = requests.get(full_content_url_template.format(id), headers=headers, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                data = content['data']
                longcontent = data['longTextContent']
                break
        except requests.RequestException:
            print("Request timed out, retrying...")
            time.sleep(2)
    return longcontent
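# Example (hypothetical mblogid): get_full_content('Nxyz123ab') returns the full
# text that the timeline API truncates behind the "展开" (expand) marker.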
# Format a post's creation time
def format_datetime(datetime_str):
    # Format of the input date string (Weibo's created_at field)
    input_format = "%a %b %d %H:%M:%S %z %Y"
    # Format of the output date string
    output_format = "%Y-%m-%d %H:%M:%S"
    # Parse the string into a datetime object
    date_obj = datetime.strptime(datetime_str, input_format)
    # Render the datetime object in the output format
    formatted_date_str = date_obj.strftime(output_format)
    return formatted_date_str
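# Example: format_datetime('Fri May 17 10:30:00 +0800 2024') -> '2024-05-17 10:30:00'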
# Extract the post source (client name) from its HTML anchor tag
def get_source(src_text):
    pattern = r'<a.*?>(.*?)</a>'
    result = re.findall(pattern, src_text)
    result_text = ''
    if len(result) == 0:
        result_text = src_text
    else:
        result_text = result[0]
    return result_text
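# Minimal entry point (a sketch): log in once via the browser to refresh the cookie
# and x-xsrf-token in devtools, paste them above, then run the crawl.
if __name__ == '__main__':
    # login('your_username', 'your_password')  # hypothetical credentials; optional if the cookie is fresh
    fetch_weibo_data()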