Python Weibo Crawler: Batch-Fetching Data for a Specified Account
2024-12-26 07:50
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from datetime import datetime
import re
# Path to the ChromeDriver binary (make sure the chromedriver path is correct)
chrome_driver_path = 'C:/Users/Administrator/Downloads/Compressed/chromedriver_win32/chromedriver.exe'
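# Note: with Selenium 4.6+ the bundled Selenium Manager can usually resolve a
# matching driver automatically, so the hard-coded path above is optional:
#   driver = webdriver.Chrome()  # no Service(...) needed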
# 1. Log in to Weibo with your own account via the browser. If a CAPTCHA
#    appears, solve it manually; if a QR code appears, scan it.
def login(username, password):
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service)
    driver.maximize_window()
    # Navigate to the target page
    driver.get('https://weibo.com')
    # Wait for the page to load (adjust the delay as needed)
    time.sleep(15)
    # Click the "log in now" button
    login_button = driver.find_element(By.XPATH, "//*[@id='__sidebar']/div/div[2]/div[1]/div/button")
    login_button.click()
    # Wait for the page to load (adjust the delay as needed)
    time.sleep(2)
    driver.switch_to.window(driver.window_handles[-1])
    # A login dialog pops up
    # Switch to the account/password login tab
    driver.find_element(By.XPATH, "//*[@id='app']/div/div/div[2]/div[2]/ul/li[2]/a").click()
    # Enter the username and password
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/form/div[1]/input').send_keys(username)
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/form/div[2]/input').send_keys(password)
    # Click the login button
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/button').click()
    # Wait for login to complete (adjust the delay as needed)
    time.sleep(20)
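    # A possible extension (not in the original script): once the login above
    # succeeds, the session cookies could be read straight from the driver and
    # reused for the requests-based API calls below, instead of copying them
    # from DevTools by hand. Sketch:
    #   cookie_str = '; '.join('{}={}'.format(c['name'], c['value'])
    #                          for c in driver.get_cookies())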
# 2. After logging in, crawl the user's posts
uid = '6282329711'  # user id; find it in the DevTools network tab, e.g. https://weibo.com/ajax/statuses/mymblog?uid=6282329711&page=1&feature=0
full_content_url_template = 'https://weibo.com/ajax/statuses/longtext?id={}'  # template URL for the full text of posts truncated behind an "expand" link
url = 'https://weibo.com/ajax/statuses/mymblog?uid={}&page={}&feature=0'  # template URL for one page of a user's posts
referer_url = 'https://weibo.com/u/{}'  # referer URL
# The cookie must be copied from DevTools after logging in
cookie = 'UOR=www.paperyy.com,service.weibo.com,www.paperyy.com; SINAGLOBAL=9732409442892.457.1685464393254; ULV=1716132809564:2:2:1:4208533399564.4937.1716132809555:1685464393294; XSRF-TOKEN=eYObo3SCebGWa1Qh0KjRJhpk; SUB=_2A25LTwvDDeRhGeBM41AS8ifLyj2IHXVoJQELrDV8PUNbmtANLVfskW9NRLDE4T7-ix-_0UDTlmMmTfpLSErD9P7D; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFbwUE27mTLonPIWJBKMPAR5JpX5KzhUgL.FoqE1hz0eo.NeK22dJLoIEXLxK-LBo5L12qLxKML12eLB-zLxKML1hnLBo2LxKMLB.eL1KqLxKMLBKnL12zt; ALF=02_1718814867; WBPSESS=Dt2hbAUaXfkVprjyrAZT_C-r5K0-gEJyRI6VaswXJvDnYyZUZqulbfZ3htR25AGrSlkEUHpYmh1Gvd7zzn4dRMsmbAxq5I8hMphxpySyYskzFcaidhCiqvoh75BgoHebQjgEUFzIGjBs7ilfB_4zx7zabbYROJG2BJvEHMRLNrEmsL1Ht7ajeqSVt6mybviShFHjkliWEa_wEa7ndBjICg=='
token = 'eYObo3SCebGWa1Qh0KjRJhpk'  # also copied from DevTools after logging in
headers = {
    'referer': referer_url.format(uid),
    'x-requested-with': 'XMLHttpRequest',
    'x-xsrf-token': token,
    'cookie': cookie,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
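# Note: the x-xsrf-token value matches the XSRF-TOKEN field inside the cookie
# string above, so it can be copied from there rather than hunted separately.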
def fetch_weibo_data():
    content_list = []
    time_list = []
    source_list = []
    attitudes_count_list = []
    comments_count_list = []
    reposts_count_list = []
    num = 1
    since_id = ''
    while True:
        try:
            # Request one page of posts
            this_url = url.format(uid, num)
            if since_id != '':
                this_url += '&since_id={}'.format(since_id)
            print("Fetching page: " + this_url)
            response = requests.get(this_url, headers=headers, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                # Parse the posts on this page
                data = content['data']
                print('total: ' + str(data['total']))
                if data['total'] == 0:
                    break
                else:
                    since_id = str(data['since_id'])
                    for item in data['list']:
                        text = item['text']
                        if '<span class="expand">展开</span>' in text:
                            # Truncated post: fetch the full text via the longtext API
                            text = get_full_content(item['mblogid'])
                        content_list.append(text)
                        time_list.append(format_datetime(item['created_at']))
                        source_list.append(get_source(item['source']))
                        attitudes_count_list.append(item['attitudes_count'])
                        comments_count_list.append(item['comments_count'])
                        reposts_count_list.append(item['reposts_count'])
                    num += 1
                    print('since_id: ' + str(data['since_id']))
                    visit_rum(this_url)
                    time.sleep(1)
            else:
                print("Failed to fetch posts; check the network connection and the account/cookie settings")
                break  # bail out instead of retrying the same page forever
        except Exception:
            print("Request timed out, retrying...")
            time.sleep(2)
    # Save the collected posts to an Excel file
    weibodata = {
        'Content': content_list,
        'Time': time_list,
        'Source': source_list,
        'Likes': attitudes_count_list,
        'Comments': comments_count_list,
        'Reposts': reposts_count_list
    }
    df = pd.DataFrame(weibodata)
    df.to_excel('weibo_{}_{}.xlsx'.format(uid, time.strftime('%Y-%m-%d %H_%M_%S', time.localtime())), index=False)
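# Note: DataFrame.to_excel needs an Excel engine such as openpyxl
# (pip install openpyxl). A CSV fallback, if Excel isn't required (the
# utf-8-sig encoding is an assumption, chosen so Excel renders the file
# correctly):
#   df.to_csv('weibo_{}.csv'.format(uid), index=False, encoding='utf-8-sig')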
def visit_rum(url_name):
    # Report to Weibo's RUM (real-user-monitoring) log endpoint, as the
    # browser would after each page request
    form_data = {
        "name": url_name,
        "entryType": "resource",
        "responseStatus": 200,
        "serverTiming": [],
        "dns": 0,
        "tcp": 0,
        "ttfb": 433.90000000000873,
        "pathname": "https://weibo.com/ajax/statuses/mymblog",
        "speed": 0
    }
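    # The hard-coded timing values (e.g. ttfb) look like numbers captured from
    # one real browser session; they are re-sent unchanged on every call.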
    while True:
        try:
            rum_url = 'https://weibo.com/ajax/log/rum'
            print("Posting to: " + rum_url)
            response = requests.post(rum_url, headers=headers, data=form_data, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                break
        except Exception:
            print("Request timed out, retrying...")
            time.sleep(2)
# Fetch the full text of a long (truncated) post
def get_full_content(mblog_id):
    longcontent = ''
    while True:
        try:
            print("Fetching page: " + full_content_url_template.format(mblog_id))
            response = requests.get(full_content_url_template.format(mblog_id), headers=headers, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                data = content['data']
                longcontent = data['longTextContent']
                break
        except Exception:
            print("Request timed out, retrying...")
            time.sleep(2)
    return longcontent
# Normalize a post's creation time
def format_datetime(datetime_str):
    # Format of the incoming date string
    input_format = "%a %b %d %H:%M:%S %z %Y"
    # Format of the outgoing date string
    output_format = "%Y-%m-%d %H:%M:%S"
    # Parse the string into a datetime object
    date_obj = datetime.strptime(datetime_str, input_format)
    # Render the datetime object in the target format
    formatted_date_str = date_obj.strftime(output_format)
    return formatted_date_str
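# Example: format_datetime('Tue Dec 24 10:30:00 +0800 2024') returns
# '2024-12-24 10:30:00' (the timezone offset is parsed but dropped on output).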
# Extract the client name from the source HTML, e.g. '<a ...>iPhone</a>' -> 'iPhone'
def get_source(src_text):
    pattern = r'<a.*?>(.*?)</a>'
    result = re.findall(pattern, src_text)
    result_text = ''
    if len(result) == 0:
        result_text = src_text
    else:
        result_text = result[0]
    return result_text
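# Minimal entry point, added as a sketch (the listing as given defines these
# functions but never calls them; the credentials are placeholders):
if __name__ == '__main__':
    login('your_username', 'your_password')  # manual CAPTCHA/QR step may be needed
    fetch_weibo_data()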