How to Scrape Weibo Data with Python: Complete Guide & Code
This tutorial walks through using Python to crawl Weibo, covering environment setup, three login methods, data extraction functions for user info, posts and comments, anti‑crawling strategies, storage to CSV or MySQL, a full example script, and legal considerations.
Introduction
Weibo is a major Chinese social media platform with valuable public data. This article explains how to crawl Weibo data using Python, covering environment setup, login simulation, data fetching, anti‑crawling measures, and data storage, with runnable code examples.
Environment Preparation
Required Packages
pip install requests selenium beautifulsoup4 pandas pyquery pymysql
requests: send HTTP requests; selenium: simulate browser actions; beautifulsoup4/pyquery: HTML parsing; pandas: data processing; pymysql: MySQL storage.
Recommended Development Environment
Python 3.8+, Chrome browser + ChromeDriver, optional MySQL 5.7+.
Weibo Login Simulation
Method 1: Cookie Login (simplest)
import requests
# Cookie values are placeholders; replace them with the ones from a
# logged-in browser session.
cookies = {'SUB': 'your_SUB', 'SUHB': 'your_SUHB'}
# A timeout keeps the script from blocking forever if the server never answers.
response = requests.get('https://weibo.com', cookies=cookies, timeout=10)
print(response.status_code)  # 200 means success

# Method 2: Selenium Login (more stable)
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
try:
    driver.get('https://weibo.com/login.php')
    # The find_element_by_* helpers were removed in Selenium 4; locate
    # elements through find_element() with a By strategy instead.
    driver.find_element(By.ID, 'loginname').send_keys('your_account')
    driver.find_element(By.NAME, 'password').send_keys('your_password')
    driver.find_element(By.XPATH, '//a[@node-type="submitBtn"]').click()
    time.sleep(5)  # fixed pause after submitting, before the cookies are read
    cookies = driver.get_cookies()
    print(cookies)
finally:
    # Release the browser even when one of the lookups above raises.
    driver.quit()

# Method 3: API Login (advanced)
import requests
# A Session keeps any cookies set by the login response for later requests.
session = requests.Session()
login_url = 'https://passport.weibo.cn/sso/login'
# Form fields posted to the login endpoint; account and password are placeholders.
data = {
    'username': 'your_account',
    'password': 'encrypted_password',
    'savestate': '1',
    'r': 'https://weibo.cn/',
    'ec': '0',
    'pagerefer': '',
    'entry': 'mweibo',
    'mainpageflag': '1',
}
headers = {
    'Referer': 'https://passport.weibo.cn/signin/login',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X)',
}
# A timeout keeps the script from blocking forever if the server never answers.
response = session.post(login_url, data=data, headers=headers, timeout=10)
print(response.json())

# Data Crawling
Fetch User Information
def get_user_info(user_id, cookies, timeout=10):
    """Fetch selected profile fields for one user from the profile-info endpoint.

    Args:
        user_id: Value placed in the ``uid`` query parameter.
        cookies: Cookie mapping sent with the request.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict with the keys ``id``, ``screen_name``, ``gender``,
        ``location``, ``description``, ``followers_count``,
        ``friends_count``, ``statuses_count`` and ``verified``; or ``None``
        when the status code is not 200 or the body does not contain the
        expected ``data -> user`` structure.
    """
    fields = (
        'id', 'screen_name', 'gender', 'location', 'description',
        'followers_count', 'friends_count', 'statuses_count', 'verified',
    )
    url = f'https://weibo.com/ajax/profile/info?uid={user_id}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    if response.status_code != 200:
        print(f"Request failed, status code: {response.status_code}")
        return None
    try:
        # ValueError covers a body that is not JSON; KeyError/TypeError cover
        # JSON that lacks the expected nesting or fields.
        user = response.json()['data']['user']
        return {field: user[field] for field in fields}
    except (ValueError, KeyError, TypeError) as exc:
        print(f"Unexpected response format: {exc!r}")
        return None


# Fetch User Weibo List (with pagination)
def get_weibo_list(user_id, page=1, cookies=None, timeout=10):
    """Fetch one page of a user's posts from the ``mymblog`` endpoint.

    Args:
        user_id: Value placed in the ``uid`` query parameter.
        page: Page number placed in the ``page`` query parameter.
        cookies: Cookie mapping sent with the request (may be ``None``).
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of dicts with the keys ``id``, ``created_at``, ``text``,
        ``reposts_count``, ``comments_count``, ``attitudes_count`` and
        ``pics``. An empty list is returned when the status code is not 200
        or the body does not contain the expected ``data -> list`` structure.
    """
    url = f'https://weibo.com/ajax/statuses/mymblog?uid={user_id}&page={page}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    if response.status_code != 200:
        print(f"Request failed, status code: {response.status_code}")
        return []
    try:
        # ValueError covers a body that is not JSON; KeyError/TypeError cover
        # JSON that lacks the expected nesting or fields.
        return [
            {
                'id': item['id'],
                'created_at': item['created_at'],
                'text': item['text_raw'],
                'reposts_count': item['reposts_count'],
                'comments_count': item['comments_count'],
                'attitudes_count': item['attitudes_count'],
                # 'pics' is optional on an item, hence the default.
                'pics': [pic['url'] for pic in item.get('pics', [])],
            }
            for item in response.json()['data']['list']
        ]
    except (ValueError, KeyError, TypeError) as exc:
        print(f"Unexpected response format: {exc!r}")
        return []


# Fetch Comments
def get_weibo_comments(weibo_id, page=1, cookies=None, timeout=10):
    """Fetch one page of comments for a post from the ``buildComments`` endpoint.

    Args:
        weibo_id: Value placed in the ``id`` query parameter.
        page: Page number placed in the ``page`` query parameter.
        cookies: Cookie mapping sent with the request (may be ``None``).
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of dicts with the keys ``id``, ``text``, ``user``,
        ``user_id``, ``created_at`` and ``like_count``. An empty list is
        returned when the status code is not 200 or the body does not
        contain the expected ``data`` list.
    """
    url = f'https://weibo.com/ajax/statuses/buildComments?flow=0&id={weibo_id}&page={page}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    if response.status_code != 200:
        print(f"Request failed, status code: {response.status_code}")
        return []
    try:
        # ValueError covers a body that is not JSON; KeyError/TypeError cover
        # JSON that lacks the expected nesting or fields.
        return [
            {
                'id': item['id'],
                'text': item['text_raw'],
                'user': item['user']['screen_name'],
                'user_id': item['user']['id'],
                'created_at': item['created_at'],
                'like_count': item['like_count'],
            }
            for item in response.json()['data']
        ]
    except (ValueError, KeyError, TypeError) as exc:
        print(f"Unexpected response format: {exc!r}")
        return []


# Anti‑Crawling Strategies
Common Measures
Rate limiting per IP
CAPTCHA challenges
User‑Agent detection
Cookie expiration
Mitigation Techniques
import random, time
# Candidate proxies (placeholder addresses). requests picks the proxy entry by
# the scheme of the target URL, so every mapping also needs an 'https' key for
# the https:// endpoints used in this article to go through the proxy.
proxies = [
    {'http': 'http://123.123.123.123:8888', 'https': 'http://123.123.123.123:8888'},
    {'http': 'http://124.124.124.124:8888', 'https': 'http://124.124.124.124:8888'},
]
# User-Agent strings to rotate between requests.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X)',
]
def random_delay(min_seconds=1, max_seconds=3):
    """Sleep for a random duration drawn uniformly from the given range.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.

    The defaults reproduce the original fixed 1-3 second pause; callers that
    need stricter throttling can pass larger bounds.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))
def safe_request(url, cookies, timeout=10):
    """GET *url* with a random User-Agent and proxy after a random pause.

    Args:
        url: Address to request.
        cookies: Cookie mapping sent with the request.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The ``requests`` response object, or ``None`` when the request
        raised a ``requests.RequestException`` (connection, proxy, timeout
        and similar errors); the error is printed in that case.
    """
    headers = {'User-Agent': random.choice(user_agents)}
    proxy = random.choice(proxies)
    random_delay()
    try:
        return requests.get(
            url, headers=headers, cookies=cookies, proxies=proxy, timeout=timeout
        )
    except requests.RequestException as e:
        # Only request-level failures are swallowed; programming errors
        # still surface instead of being hidden behind a None result.
        print(f"Request exception: {e}")
        return None


# Data Storage
CSV
import pandas as pd
def save_to_csv(data, filename):
    """Write *data* to a CSV file at *filename*.

    Args:
        data: Records accepted by ``pandas.DataFrame`` (the article passes
            a list of dicts, one per row).
        filename: Destination path of the CSV file.
    """
    df = pd.DataFrame(data)
    # index=False omits the DataFrame row index; 'utf_8_sig' prefixes the
    # file with a UTF-8 byte-order mark.
    df.to_csv(filename, index=False, encoding='utf_8_sig')


# MySQL
import pymysql
def _quote_identifier(name):
    """Wrap *name* in backticks, doubling any embedded backtick."""
    return '`' + str(name).replace('`', '``') + '`'


def save_to_mysql(data, table_name, host='localhost', user='root',
                  password='123456', database='weibo_data'):
    """Insert a list of dict records into a MySQL table.

    Args:
        data: List of dicts; the keys of the first dict are used as the
            column names for every row.
        table_name: Target table, optionally qualified as ``schema.table``.
        host, user, password, database: Connection settings passed to
            ``pymysql.connect``; the defaults are the values that were
            previously hard-coded.

    Returns:
        ``None``. Nothing is done (and no connection is opened) when *data*
        is empty.

    Raises:
        KeyError: If a later record lacks one of the first record's keys.
    """
    if not data:
        # Nothing to insert; also avoids IndexError on data[0].
        return

    columns = list(data[0].keys())
    # Identifiers cannot be bound as query parameters, so quote them instead
    # of interpolating raw text into the statement.
    table = '.'.join(_quote_identifier(part) for part in table_name.split('.'))
    keys = ', '.join(_quote_identifier(col) for col in columns)
    placeholders = ', '.join(['%s'] * len(columns))
    sql = f"INSERT INTO {table} ({keys}) VALUES ({placeholders})"
    # Pull values by column name so a record with a different key order
    # cannot put values in the wrong columns.
    rows = [tuple(item[col] for col in columns) for item in data]

    connection = pymysql.connect(host=host, user=user, password=password,
                                 database=database, charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            cursor.executemany(sql, rows)
        connection.commit()
    finally:
        connection.close()


# Full Example
# Configure cookies
WEIBO_COOKIES = {'SUB': 'your_SUB', 'SUHB': 'your_SUHB'}

# 1. Get user info
user_id = '1669879400'  # Example: People's Daily
user_info = get_user_info(user_id, WEIBO_COOKIES)
print(user_info)

# 2. Get first 5 pages of posts
all_weibos = []
for page in range(1, 6):
    weibos = get_weibo_list(user_id, page, WEIBO_COOKIES)
    all_weibos.extend(weibos)
    print(f"Fetched page {page}, {len(weibos)} posts")
    if not weibos:
        # An empty page means the request failed or there are no more
        # posts; further requests would only add load.
        break
    # Pause between pages; 3 seconds matches the throttling guidance given
    # in this article's legal notes.
    time.sleep(3)

# 3. Save to CSV
save_to_csv(all_weibos, 'weibo_content.csv')

# 4. Get comments of the first post
if all_weibos:
    first_weibo_id = all_weibos[0]['id']
    comments = get_weibo_comments(first_weibo_id, 1, WEIBO_COOKIES)
    save_to_csv(comments, 'weibo_comments.csv')

# Legal and Ethical Notes
Respect robots.txt and platform crawling policies.
Throttle requests (e.g., ≥3 seconds) to avoid overloading servers.
Use data only for personal learning or research; do not store sensitive personal information.
Commercial use requires official API authorization.
FAQ
Why is no data returned? Possible reasons: expired cookies, too high a request rate, or an IP ban. Try refreshing your cookies, using proxy IPs, or increasing the delay between requests.
How to get older posts? Web version shows limited history; use mobile API or paid API for older data.
What if a CAPTCHA appears? Options: manual entry, third‑party captcha solving services, or lower request frequency.
How to fix garbled characters? Ensure UTF‑8 encoding; use utf_8_sig for CSV and utf8mb4 charset for MySQL.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Python Programming Learning Circle
A global community of Chinese Python developers offering technical articles, columns, original video tutorials, and problem sets. Topics include web full‑stack development, web scraping, data analysis, natural language processing, image processing, machine learning, automated testing, DevOps automation, and big data.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
