Python Weibo Crawler: Batch-Fetching Data for a Specified Account
2024-12-26 07:50
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from datetime import datetime
import re
# Path to the ChromeDriver binary (make sure the chromedriver path is correct)
chrome_driver_path = 'C:/Users/Administrator/Downloads/Compressed/chromedriver_win32/chromedriver.exe'
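# Note: with Selenium 4.6+ the bundled Selenium Manager can usually resolve a
# matching driver automatically, so the hard-coded path above is optional:
#   driver = webdriver.Chrome()  # no Service(...) needed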
# 1. Log in to Weibo with your own account via the browser. If a CAPTCHA
#    appears, solve it manually; if a QR code appears, scan it.
def login(username, password):
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service)
    driver.maximize_window()
    # Navigate to the target page
    driver.get('https://weibo.com')
    # Wait for the page to load (adjust the delay as needed)
    time.sleep(15)
    # Click the "log in now" button
    login_button = driver.find_element(By.XPATH, "//*[@id='__sidebar']/div/div[2]/div[1]/div/button")
    login_button.click()
    # Wait for the page to load (adjust the delay as needed)
    time.sleep(2)
    driver.switch_to.window(driver.window_handles[-1])
    # A login dialog pops up
    # Switch to the account/password login tab
    driver.find_element(By.XPATH, "//*[@id='app']/div/div/div[2]/div[2]/ul/li[2]/a").click()
    # Enter the username and password
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/form/div[1]/input').send_keys(username)
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/form/div[2]/input').send_keys(password)
    # Click the login button
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/button').click()
    # Wait for login to complete (adjust the delay as needed)
    time.sleep(20)
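    # A possible extension (not in the original script): once the login above
    # succeeds, the session cookies could be read straight from the driver and
    # reused for the requests-based API calls below, instead of copying them
    # from DevTools by hand. Sketch:
    #   cookie_str = '; '.join('{}={}'.format(c['name'], c['value'])
    #                          for c in driver.get_cookies())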
# 2. After logging in, crawl the user's posts
uid = '6282329711'  # user id; find it in the DevTools network tab, e.g. https://weibo.com/ajax/statuses/mymblog?uid=6282329711&page=1&feature=0
full_content_url_template = 'https://weibo.com/ajax/statuses/longtext?id={}'  # template URL for the full text of posts truncated behind an "expand" link
url = 'https://weibo.com/ajax/statuses/mymblog?uid={}&page={}&feature=0'  # template URL for one page of a user's posts
referer_url = 'https://weibo.com/u/{}'  # referer URL
# The cookie must be copied from DevTools after logging in
cookie = 'UOR=www.paperyy.com,service.weibo.com,www.paperyy.com; SINAGLOBAL=9732409442892.457.1685464393254; ULV=1716132809564:2:2:1:4208533399564.4937.1716132809555:1685464393294; XSRF-TOKEN=eYObo3SCebGWa1Qh0KjRJhpk; SUB=_2A25LTwvDDeRhGeBM41AS8ifLyj2IHXVoJQELrDV8PUNbmtANLVfskW9NRLDE4T7-ix-_0UDTlmMmTfpLSErD9P7D; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFbwUE27mTLonPIWJBKMPAR5JpX5KzhUgL.FoqE1hz0eo.NeK22dJLoIEXLxK-LBo5L12qLxKML12eLB-zLxKML1hnLBo2LxKMLB.eL1KqLxKMLBKnL12zt; ALF=02_1718814867; WBPSESS=Dt2hbAUaXfkVprjyrAZT_C-r5K0-gEJyRI6VaswXJvDnYyZUZqulbfZ3htR25AGrSlkEUHpYmh1Gvd7zzn4dRMsmbAxq5I8hMphxpySyYskzFcaidhCiqvoh75BgoHebQjgEUFzIGjBs7ilfB_4zx7zabbYROJG2BJvEHMRLNrEmsL1Ht7ajeqSVt6mybviShFHjkliWEa_wEa7ndBjICg=='
token = 'eYObo3SCebGWa1Qh0KjRJhpk'  # also copied from DevTools after logging in
headers = {
    'referer': referer_url.format(uid),
    'x-requested-with': 'XMLHttpRequest',
    'x-xsrf-token': token,
    'cookie': cookie,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
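# Note: the x-xsrf-token value matches the XSRF-TOKEN field inside the cookie
# string above, so it can be copied from there rather than hunted separately.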
def fetch_weibo_data():
    content_list = []
    time_list = []
    source_list = []
    attitudes_count_list = []
    comments_count_list = []
    reposts_count_list = []
    num = 1
    since_id = ''
    while True:
        try:
            # Request one page of posts
            this_url = url.format(uid, num)
            if since_id != '':
                this_url += '&since_id={}'.format(since_id)
            print("Fetching page: " + this_url)
            response = requests.get(this_url, headers=headers, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                # Parse the posts on this page
                data = content['data']
                print('total: ' + str(data['total']))
                if data['total'] == 0:
                    break
                else:
                    since_id = str(data['since_id'])
                    for item in data['list']:
                        text = item['text']
                        if '<span class="expand">展开</span>' in text:
                            # Truncated post: fetch the full text via the longtext API
                            text = get_full_content(item['mblogid'])
                        content_list.append(text)
                        time_list.append(format_datetime(item['created_at']))
                        source_list.append(get_source(item['source']))
                        attitudes_count_list.append(item['attitudes_count'])
                        comments_count_list.append(item['comments_count'])
                        reposts_count_list.append(item['reposts_count'])
                    num += 1
                    print('since_id: ' + str(data['since_id']))
                    visit_rum(this_url)
                    time.sleep(1)
            else:
                print("Failed to fetch posts; check the network connection and the account/cookie settings")
                break  # bail out instead of retrying the same page forever
        except Exception:
            print("Request timed out, retrying...")
            time.sleep(2)
    # Save the collected posts to an Excel file
    weibodata = {
        'Content': content_list,
        'Time': time_list,
        'Source': source_list,
        'Likes': attitudes_count_list,
        'Comments': comments_count_list,
        'Reposts': reposts_count_list
    }
    df = pd.DataFrame(weibodata)
    df.to_excel('weibo_{}_{}.xlsx'.format(uid, time.strftime('%Y-%m-%d %H_%M_%S', time.localtime())), index=False)
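# Note: DataFrame.to_excel needs an Excel engine such as openpyxl
# (pip install openpyxl). A CSV fallback, if Excel isn't required (the
# utf-8-sig encoding is an assumption, chosen so Excel renders the file
# correctly):
#   df.to_csv('weibo_{}.csv'.format(uid), index=False, encoding='utf-8-sig')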
def visit_rum(url_name):
    # Report to Weibo's RUM (real-user-monitoring) log endpoint, as the
    # browser would after each page request
    form_data = {
        "name": url_name,
        "entryType": "resource",
        "responseStatus": 200,
        "serverTiming": [],
        "dns": 0,
        "tcp": 0,
        "ttfb": 433.90000000000873,
        "pathname": "https://weibo.com/ajax/statuses/mymblog",
        "speed": 0
    }
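    # The hard-coded timing values (e.g. ttfb) look like numbers captured from
    # one real browser session; they are re-sent unchanged on every call.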
    while True:
        try:
            rum_url = 'https://weibo.com/ajax/log/rum'
            print("Posting to: " + rum_url)
            response = requests.post(rum_url, headers=headers, data=form_data, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                break
        except Exception:
            print("Request timed out, retrying...")
            time.sleep(2)
# Fetch the full text of a long (truncated) post
def get_full_content(mblog_id):
    longcontent = ''
    while True:
        try:
            print("Fetching page: " + full_content_url_template.format(mblog_id))
            response = requests.get(full_content_url_template.format(mblog_id), headers=headers, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                data = content['data']
                longcontent = data['longTextContent']
                break
        except Exception:
            print("Request timed out, retrying...")
            time.sleep(2)
    return longcontent
# Normalize a post's creation time
def format_datetime(datetime_str):
    # Format of the incoming date string
    input_format = "%a %b %d %H:%M:%S %z %Y"
    # Format of the outgoing date string
    output_format = "%Y-%m-%d %H:%M:%S"
    # Parse the string into a datetime object
    date_obj = datetime.strptime(datetime_str, input_format)
    # Render the datetime object in the target format
    formatted_date_str = date_obj.strftime(output_format)
    return formatted_date_str
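# Example: format_datetime('Tue Dec 24 10:30:00 +0800 2024') returns
# '2024-12-24 10:30:00' (the timezone offset is parsed but dropped on output).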
# Extract the client name from the source HTML, e.g. '<a ...>iPhone</a>' -> 'iPhone'
def get_source(src_text):
    pattern = r'<a.*?>(.*?)</a>'
    result = re.findall(pattern, src_text)
    result_text = ''
    if len(result) == 0:
        result_text = src_text
    else:
        result_text = result[0]
    return result_text
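# Minimal entry point, added as a sketch (the listing as given defines these
# functions but never calls them; the credentials are placeholders):
if __name__ == '__main__':
    login('your_username', 'your_password')  # manual CAPTCHA/QR step may be needed
    fetch_weibo_data()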