Python Weibo Crawler: Batch-Fetching Data from a Specified Account

Date: 2024-12-26    Author: shqfhg6868
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from datetime import datetime
import re

# Path to the WebDriver binary (make sure the chromedriver path is correct)
chrome_driver_path = 'C:/Users/Administrator/Downloads/Compressed/chromedriver_win32/chromedriver.exe'

# 1. Log in to Weibo with your own account via Selenium.
#    If a CAPTCHA appears, solve it by hand; if a QR code appears, scan it.
def login(username, password):
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service)
    driver.maximize_window()
    # Navigate to the target page
    driver.get('https://weibo.com')
    # Wait for the page to load (adjust the wait time as needed)
    time.sleep(15)
    # Click the "log in now" button
    login_button = driver.find_element(By.XPATH, "//*[@id='__sidebar']/div/div[2]/div[1]/div/button")
    login_button.click()
    # Wait for the page to load (adjust the wait time as needed)
    time.sleep(2)
    driver.switch_to.window(driver.window_handles[-1])  # a login dialog pops up
    # Switch to the username/password login tab
    driver.find_element(By.XPATH, "//*[@id='app']/div/div/div[2]/div[2]/ul/li[2]/a").click()
    # Enter the username and password
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/form/div[1]/input').send_keys(username)
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/form/div[2]/input').send_keys(password)
    # Click the login button
    driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/div[2]/button').click()
    # Wait for the login to complete (adjust the wait time as needed)
    time.sleep(20)

# 2. After logging in, scrape the account's posts.
uid = '6282329711'  # user id, taken from the DevTools request https://weibo.com/ajax/statuses/mymblog?uid=6282329711&page=1&feature=0
full_content_url_template = 'https://weibo.com/ajax/statuses/longtext?id={}'  # URL template for posts whose full text sits behind an "expand" link
url = 'https://weibo.com/ajax/statuses/mymblog?uid={}&page={}&feature=0'  # URL template for one page of posts
referer_url = 'https://weibo.com/u/{}'  # referer URL

# Copy the cookie from DevTools after logging in
cookie = 'UOR=www.paperyy.com,service.weibo.com,www.paperyy.com; SINAGLOBAL=9732409442892.457.1685464393254; ULV=1716132809564:2:2:1:4208533399564.4937.1716132809555:1685464393294; XSRF-TOKEN=eYObo3SCebGWa1Qh0KjRJhpk; SUB=_2A25LTwvDDeRhGeBM41AS8ifLyj2IHXVoJQELrDV8PUNbmtANLVfskW9NRLDE4T7-ix-_0UDTlmMmTfpLSErD9P7D; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFbwUE27mTLonPIWJBKMPAR5JpX5KzhUgL.FoqE1hz0eo.NeK22dJLoIEXLxK-LBo5L12qLxKML12eLB-zLxKML1hnLBo2LxKMLB.eL1KqLxKMLBKnL12zt; ALF=02_1718814867; WBPSESS=Dt2hbAUaXfkVprjyrAZT_C-r5K0-gEJyRI6VaswXJvDnYyZUZqulbfZ3htR25AGrSlkEUHpYmh1Gvd7zzn4dRMsmbAxq5I8hMphxpySyYskzFcaidhCiqvoh75BgoHebQjgEUFzIGjBs7ilfB_4zx7zabbYROJG2BJvEHMRLNrEmsL1Ht7ajeqSVt6mybviShFHjkliWEa_wEa7ndBjICg=='
token = 'eYObo3SCebGWa1Qh0KjRJhpk'  # copy from DevTools after logging in

headers = {
    'referer': referer_url.format(uid),
    'x-requested-with': 'XMLHttpRequest',
    'x-xsrf-token': token,
    'cookie': cookie,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

def fetch_weibo_data():
    content_list = []
    time_list = []
    source_list = []
    attitudes_count_list = []
    comments_count_list = []
    reposts_count_list = []
    num = 1
    since_id = ''
    while True:
        try:
            # Request one page of posts
            this_url = url.format(uid, num)
            if since_id != '':
                this_url += '&since_id={}'.format(since_id)
            print("Visiting: " + this_url)
            response = requests.get(this_url, headers=headers, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                # Parse the post data
                data = content['data']
                print('total: ' + str(data['total']))
                if data['total'] == 0:
                    break
                since_id = str(data['since_id'])
                for item in data['list']:
                    text = item['text']
                    # '展开' ("expand") marks a truncated post; fetch its full text separately
                    if '<span class="expand">展开</span>' in text:
                        text = get_full_content(item['mblogid'])
                    content_list.append(text)
                    time_list.append(format_datetime(item['created_at']))
                    source_list.append(get_source(item['source']))
                    attitudes_count_list.append(item['attitudes_count'])
                    comments_count_list.append(item['comments_count'])
                    reposts_count_list.append(item['reposts_count'])
                num += 1
                print('since_id: ' + str(data['since_id']))
                visit_rum(this_url)
                time.sleep(1)
            else:
                print("Failed to fetch Weibo data; check your network connection and account")
                break  # do not retry an error response forever
        except Exception:
            print("Request timed out, retrying...")
            time.sleep(2)

    # Save the scraped data to an Excel file
    weibodata = {
        'Content': content_list,
        'Time': time_list,
        'Source': source_list,
        'Likes': attitudes_count_list,
        'Comments': comments_count_list,
        'Reposts': reposts_count_list
    }
    df = pd.DataFrame(weibodata)
    df.to_excel('weibo_{}_{}.xlsx'.format(uid, time.strftime('%Y-%m-%d %H_%M_%S', time.localtime())), index=False)

def visit_rum(url_name):
    # Post a performance-log entry to Weibo's RUM endpoint, mimicking what the browser sends
    form_data = {
        "name": url_name,
        "entryType": "resource",
        "responseStatus": 200,
        "serverTiming": [],
        "dns": 0,
        "tcp": 0,
        "ttfb": 433.90000000000873,
        "pathname": "https://weibo.com/ajax/statuses/mymblog",
        "speed": 0
    }
    while True:
        try:
            rum_url = 'https://weibo.com/ajax/log/rum'
            print("Visiting: " + rum_url)
            response = requests.post(rum_url, headers=headers, data=form_data, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                break
        except Exception:
            print("Request timed out, retrying...")
            time.sleep(2)

# Fetch the full text of a long ("expand") post
def get_full_content(mblogid):
    longcontent = ''
    while True:
        try:
            print("Visiting: " + full_content_url_template.format(mblogid))
            response = requests.get(full_content_url_template.format(mblogid), headers=headers, timeout=10)
            response.encoding = 'utf-8'
            content = response.json()
            if content['ok'] == 1:
                data = content['data']
                longcontent = data['longTextContent']
                break
        except Exception:
            print("Request timed out, retrying...")
            time.sleep(2)
    return longcontent

# Format a post's creation time
def format_datetime(datetime_str):
    # Format of the input date string, e.g. "Thu Dec 26 10:30:00 +0800 2024"
    input_format = "%a %b %d %H:%M:%S %z %Y"
    # Format of the output date string
    output_format = "%Y-%m-%d %H:%M:%S"
    # Parse the string into a datetime object
    date_obj = datetime.strptime(datetime_str, input_format)
    # Render it in the target format
    formatted_date_str = date_obj.strftime(output_format)
    return formatted_date_str

# Extract the plain-text client name from the "source" HTML snippet
# (e.g. '<a ...>iPhone</a>' -> 'iPhone')
def get_source(src_text):
    pattern = r'<a.*?>(.*?)</a>'
    result = re.findall(pattern, src_text)
    if len(result) == 0:
        result_text = src_text
    else:
        result_text = result[0]
    return result_text
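The article never shows how these functions are wired together. A minimal sketch of an entry point, assuming you run the interactive login first and then paste the resulting cookie and token into the module-level variables above (the placeholder credentials are illustrative, not from the original):

    if __name__ == '__main__':
        # Step 1: interactive login; solve any CAPTCHA or QR prompt by hand.
        # 'your_username' / 'your_password' are placeholders.
        login('your_username', 'your_password')
        # Step 2: after updating `cookie` and `token` from DevTools,
        # scrape every page of the account's posts and write the Excel file.
        fetch_weibo_data()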

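One manual step remains: copying `cookie` and `token` out of DevTools after each login. Note that in the captured headers above the token is simply the value of the XSRF-TOKEN cookie. As a possible shortcut, which is an assumption on my part and not something the article does, both can be read straight from the Selenium session, provided login() is changed to return its driver:

    # Sketch (not from the original article): derive the request credentials
    # from the logged-in Selenium session instead of copying them by hand.
    def session_credentials(driver):
        cookies = driver.get_cookies()  # list of {'name': ..., 'value': ...} dicts
        cookie_str = '; '.join('{}={}'.format(c['name'], c['value']) for c in cookies)
        # The x-xsrf-token header mirrors the XSRF-TOKEN cookie.
        token = next((c['value'] for c in cookies if c['name'] == 'XSRF-TOKEN'), '')
        return cookie_str, token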