How to Scrape Maoyan Movie Data and Visualize Trends with Python
This tutorial walks you through collecting movie information from Maoyan using Python web‑scraping, storing the results in CSV, and then applying pandas, matplotlib, and WordCloud to analyze and visualize trends such as release years, genres, regions, durations, and ratings across China and the world.
Tools Preparation
Data source: https://maoyan.com/board/4?offset=1
Development environment: Windows 10, Python 3.7
IDE and browser: PyCharm, Chrome
Project Idea
The goal is to crawl all movie information from Maoyan, using the top‑100 list as an example, and extract fields such as movie name, rating, link, genre, release location, duration, and release year.
Scraping Code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021年06月05日
# @File : demo4.py
import requests
from fake_useragent import UserAgent
from lxml import etree
import time
# User-Agent generator; one random browser string is drawn from it below
ua = UserAgent()
# Request headers passed to every requests.get call in RequestsTools
# (the Cookie is a hard-coded captured value and may need to be updated manually)
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Cookie': '__mta=244176442.1622872454168.1622876903037.1622877097390.7; uuid_n_v=v1; uuid=6FFF6D30C5C211EB8D61CF53B1EFE83FE91D3C40EE5240DCBA0A422050B1E8C0; _csrf=bff9b813020b795594ff3b2ea3c1be6295b7453d19ecd72f8beb9700c679dfb4; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1622872443; _lxsdk_cuid=1770e9ed136c8-048c356e76a22b-7d677965-1fa400-1770e9ed136c8; _lxsdk=6FFF6D30C5C211EB8D61CF53B1EFE83FE91D3C40EE5240DCBA0A422050B1E8C0; ci=59; recentCis=59',
# NOTE: ua.random is evaluated once, when this dict is built, so every
# request made with `headers` carries the same User-Agent string
'User-Agent': str(ua.random)
}
def RequestsTools(url):
    """Fetch *url* and return it parsed for XPath extraction.

    :param url: request URL
    :return: lxml element built from the UTF-8 decoded response body
    :raises requests.RequestException: on connection failure, timeout,
        or a non-2xx HTTP status
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page into
    # silently empty XPath results.
    response.raise_for_status()
    return etree.HTML(response.content.decode('utf-8'))
def Index(page):
    """Scrape one board page and hand each listed movie to ``Details``.

    :param page: offset value placed in the board URL
    """
    board = RequestsTools(f'https://maoyan.com/board/4?offset={page}')
    hrefs = board.xpath('//a[@class="image-link"]/@href')
    # The rating text is read from two elements ("integer" and "fraction")
    # and concatenated below.
    integer_parts = board.xpath('//i[@class="integer"]/text()')
    fraction_parts = board.xpath('//i[@class="fraction"]/text()')
    for href, integer_part, fraction_part in zip(hrefs, integer_parts, fraction_parts):
        # Pause two seconds before each detail-page request.
        time.sleep(2)
        Details('https://maoyan.com' + href, integer_part + fraction_part)
def Details(url, pingfen):
    """Scrape one movie detail page and append its row to ``猫眼.csv``.

    The row holds: name, rating, URL, genre, region, duration, release text.

    :param url: detail page URL
    :param pingfen: rating text collected from the board page
    """
    import csv  # local import: stdlib CSV quoting for the output row

    html = RequestsTools(url)
    dianyan = html.xpath('//h1[@class="name"]/text()')  # movie name
    leixing = html.xpath('//li[@class="ellipsis"]/a/text()')  # genre
    # region and duration share one text node, separated by '/'
    diqu = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[2]/text()')
    timedata = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')  # release year
    for name, genre, region_duration, release in zip(dianyan, leixing, diqu, timedata):
        parts = region_duration.replace('\n', '').split('/')
        countyr = parts[0].strip()  # region
        # Leave the duration empty when the text has no '/' separator
        # instead of raising IndexError.
        shichang = parts[1].strip() if len(parts) > 1 else ''  # duration
        # csv.writer quotes fields that contain commas, so such a field
        # cannot shift the columns; newline='' lets the writer control
        # line endings.
        with open('猫眼.csv', 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(
                [name, pingfen, url, genre, countyr, shichang, release]
            )
        print(name, pingfen, url, genre, countyr, shichang, release)
for page in range(0, 11):
page *= 10
Index(page)Data Visualization Tools
import pandas as pd
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# %matplotlib inline
Visualization Code
# Read the CSV, discard its first column, then remove rows that have
# missing values or are exact duplicates
path = './maoyan.csv'
raw = pd.read_csv(path, sep=',', encoding='utf-8')
df = raw.drop(columns=raw.columns[0]).dropna().drop_duplicates()
# Line chart of how many movies fall in each release year before 2018
before_2018 = df[df['上映时间'] < 2018]
movies_per_year = before_2018['上映时间'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(9, 6), dpi=70)
movies_per_year.plot(kind='line', ax=ax)
ax.set(xlabel='Year', ylabel='Number of Movies', title='Movies Released per Year')
# Relationship between year, count, and average rating
# Filter once and reuse; the same selection was previously rebuilt three times.
pre_2018 = df[df['上映时间'] < 2018]
year_counts = pre_2018['上映时间'].value_counts().sort_index()
x = year_counts.index
y = year_counts.values
# Pick the rating column before aggregating: calling .mean() on the whole
# grouped frame raises TypeError on text columns in pandas >= 2.0.
# groupby sorts its keys, so y2 lines up with the sorted years in x.
y2 = pre_2018.groupby('上映时间')['评分'].mean().values
fig, ax = plt.subplots(figsize=(10, 5), dpi=70)
ax.plot(x, y, label='Count')
ax.set_xlim(1980, 2017)
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title('Year, Count & Average Rating')
# Second y-axis so ratings and counts can share the same year axis
ax2 = ax.twinx()
ax2.plot(x, y2, c='y', ls='--', label='Rating')
ax.legend(loc=1)
ax2.legend(loc=2)
# World average rating over years (rows with a rating of 0 or less are left out)
fig, ax = plt.subplots(figsize=(10, 7), dpi=60)
rated = df[df['评分'] > 0]
# Select the rating column before .mean(): averaging every column of the
# grouped frame raises TypeError on text columns in pandas >= 2.0.
rated.groupby('上映时间')['评分'].mean().plot(kind='line', ax=ax)
ax.set_ylabel('Rating')
ax.set_title('World Average Rating by Year')
# Count how often each genre occurs; a row may list several genres
# separated by commas, and each one is counted
types = []
for genre_field in df['类型']:
    types.extend(genre_field.split(','))
genre_df = pd.DataFrame({'类型': types})
genre_counts = genre_df['类型'].value_counts()
fig, ax = plt.subplots(figsize=(9, 6), dpi=60)
genre_counts.plot(kind='bar', ax=ax)
ax.set(xlabel='Genre', ylabel='Count', title='Genre Distribution')
# Scatter of running time against rating, restricted to rows whose rating is above 0
valid = df[df['评分'] > 0]
durations = valid['时长(min)']
ratings = valid['评分']
fig, ax = plt.subplots(figsize=(9, 6), dpi=70)
ax.scatter(durations, ratings, alpha=0.6, marker='o')
ax.set(xlabel='Duration (min)', ylabel='Rating', title='Duration vs Rating')
# China vs World comparison (1980‑2017)
china_df = df[df['地区'].str.contains('中国大陆')]
# Average rating per release year, computed here for both series. The
# previous code plotted an undefined `y1` and drew the world averages
# under the 'China' label while never using china_df.
world_avg = df[df['上映时间'] < 2018].groupby('上映时间')['评分'].mean()
china_avg = china_df[china_df['上映时间'] < 2018].groupby('上映时间')['评分'].mean()
fig, ax = plt.subplots(figsize=(12, 9), dpi=60)
# Each series is plotted against its own years; the two sets of years
# need not be the same.
ax.plot(world_avg.index, world_avg.values, ls='-', c='DarkTurquoise', label='World')
ax.plot(china_avg.index, china_avg.values, ls='--', c='Gold', label='China')
ax.set_title('China vs World Average Rating')
ax.set_xlabel('Year')
ax.set_xlim(1980, 2017)
ax.set_ylabel('Rating')
ax.legend()
# Word cloud of movie titles
wl = ",".join(df['电影'][:15].values)
wc = WordCloud(background_color='white', font_path='C:\\Windows\\Fonts\\simkai.ttf', max_font_size=60, random_state=30)
myword = wc.generate(wl)
wc.to_file('result.jpg')
plt.imshow(myword)
plt.axis('off')
plt.show()Result Images
The generated plots illustrate release trends, genre counts, duration‑rating distributions, and a comparison between Chinese and global movie ratings.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
MaGe Linux Operations
Founded in 2009, MaGe Education is a top Chinese high‑end IT training brand. Its graduates earn 12K+ RMB salaries, and the school has trained tens of thousands of students. It offers high‑pay courses in Linux cloud operations, Python full‑stack, automation, data analysis, AI, and Go high‑concurrency architecture. Thanks to quality courses and a solid reputation, it has talent partnerships with numerous internet firms.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
