Scraping Eleme Store Comments with Python
This tutorial explains how to use Python on Windows to crawl any Eleme restaurant's comments—including user IDs, ratings, dates, text, and images—by analyzing network requests, constructing request URLs, retrieving JSON data, and storing the results in Excel or databases.
Goal: Retrieve all comment data from any Eleme store, including user ID, rating, comment date, text, images, and merchant replies.
Environment: Windows 7, Python 3.6, PyCharm 2019.
Scraping steps: link analysis, request construction, data retrieval, and data storage.
1. Link analysis: Use Chrome Network tools to capture the API endpoints after opening a store page and clicking the comments tab. The relevant endpoints start with batch_comments and ratings.
Example URLs:
# https://h5.ele.me/pizza/ugc/restaurants/E2096802567040327184/batch_comments?has_content=true&offset=0&limit=20
# https://h5.ele.me/restapi/ugc/v3/restaurants/E2096802567040327184/ratings?has_content=true&tag_name=全部&offset=20&limit=20
2. Request construction: Build request URLs using the store ID, offset, and limit. The script below builds comment_url with string formatting; a ratings_url can be built the same way.
3. Python implementation:
class ElemeComment:
    """Eleme comment crawler.

    Shop records are read from an xlsx workbook into ``shopid_queue``.  A
    worker thread pops one record at a time, downloads its comment pages and
    writes them to ``./data/comment/<shop name>.xlsx``.  A second thread
    flushes whatever is put on ``save_queue`` into ``./data/menu/`` workbooks.
    Both threads are started by the constructor and run as daemons.
    """

    _file_path = './data/'

    # Pagination: at most _max_pages requests of _page_size comments per shop.
    _page_size = 20
    _max_pages = 4
    # Seconds to wait for any HTTP response before giving up on the request.
    _request_timeout = 30
    # Consecutive NEED_SLIDE answers tolerated before the cookies are renewed.
    _max_slide_retries = 6
    # Text the API puts in "ret" when it throttles the client
    # (roughly: "overloaded, please retry later").
    _rate_limit_marker = '被挤爆啦,请稍后重试'

    def __init__(self, file_name=''):
        """Load the shop list, read the cookies and start the worker threads.

        Args:
            file_name: Path of the xlsx workbook that lists the shops to crawl.
        """
        self.pwd = os.getcwd()  # current path
        self.file_name = file_name
        self.all_proxy = AllProxy(pool_type='only_new')
        self.shopid_queue = Queue()
        self.get_area_from_xlsx(file_name)
        self.save_queue = Queue()
        self.data_all = 0
        self.is_cookie_invalid = True
        self.new_cookies = dict()
        self.new_cookies_invalid = True
        self.txt_cookies = GetCookiesFromTxt()
        self.cookies = self.txt_cookies.get_cookies_with_cmd_input()

        get_menu_thread_num = 1
        # One flag per crawler thread: 0 while waiting for a shop, 1 while crawling.
        self.get_menu_thread_runningflag = [0] * get_menu_thread_num
        thread_list = [
            threading.Thread(target=self.get_comment_thread, args=(i, ''))
            for i in range(get_menu_thread_num)
        ]
        thread_list.append(threading.Thread(target=self.save_thread))
        for t in thread_list:
            # The attribute form replaces the deprecated Thread.setDaemon().
            t.daemon = True
            t.start()

    def get_area_from_xlsx(self, file_name):
        """Queue one shop record for every row of the active worksheet.

        Each record is the list of cell values in columns 1, 5, 6 and 7, which
        get_comment_thread reads as shop name, shop id, latitude and longitude.

        Args:
            file_name: Path of the xlsx workbook to read.
        """
        ws = load_workbook(file_name).active
        for row in range(1, ws.max_row + 1):
            self.shopid_queue.put([ws.cell(row, col).value for col in (1, 5, 6, 7)])
        print(file_name, self.shopid_queue.qsize())

    def requests_json(self, url, method="GET", cookies=None, headers=None, data=None, proxies=None):
        """Send an HTTP request and return the response body as text.

        Args:
            url: Address to request.
            method: "POST" sends a POST request; anything else sends a GET.
            cookies: Optional cookies mapping.
            headers: Optional headers mapping.
            data: Optional body for POST requests.
            proxies: Optional proxies mapping.

        Returns:
            The response text, or None when the request raised an exception.
        """
        try:
            if method == "POST":
                # A timeout is required here too, otherwise a stalled POST
                # would block the calling thread forever.
                r = requests.post(url, data=data, headers=headers, cookies=cookies,
                                  timeout=self._request_timeout, proxies=proxies)
            else:
                r = requests.get(url, headers=headers, cookies=cookies,
                                 timeout=self._request_timeout, proxies=proxies)
            return r.text
        except Exception as err:
            # Best effort: callers treat None as "retry later".  Catching
            # Exception (not everything) lets Ctrl-C and SystemExit through.
            LOGGER.warning('%s %s failed: %s', method, url, err)
            return None

    def get_cookies_wait(self):
        """Discard the stored cookies and read a fresh set."""
        time.sleep(1)
        self.txt_cookies.delete_cookies()
        self.cookies = self.txt_cookies.get_cookies_with_cmd_input()

    @staticmethod
    def _image_url(image_hash):
        """Return the 130x130 thumbnail URL for an image hash."""
        extension = 'png' if image_hash.endswith('png') else 'jpeg'
        template = 'https://fuss10.elemecdn.com/{s1}/{s2}/{s3}.{s4}?imageMogr/format/webp/thumbnail/!130x130r/gravity/Center/crop/130x130/'
        return template.format(s1=image_hash[0:1], s2=image_hash[1:3],
                               s3=image_hash[3:], s4=extension)

    def get_comment(self, ws, j):
        """Append every comment of a decoded response page to a worksheet.

        One row is appended per comment: user name, rating, date, text, number
        of images and the image URLs, each URL followed by ';'.

        Args:
            ws: Object with an ``append(row)`` method (the target worksheet).
            j: Decoded JSON response; must contain a 'comments' list.

        Returns:
            The number of comments in the page.

        Raises:
            KeyError: If 'comments' or one of the comment fields is missing.
        """
        comment_list = j['comments']
        for comment in comment_list:
            username = comment['username']
            rating = comment['rating']
            rated_at = comment['rated_at']
            rating_text = comment['rating_text']
            # The API sends None instead of a list when there are no images.
            order_images = comment['order_images'] or []
            image_str = ''.join(self._image_url(img['image_hash']) + ';' for img in order_images)
            ws.append([username, rating, rated_at, rating_text, len(order_images), image_str])
        return len(comment_list)

    @staticmethod
    def _build_headers(shop_id, area_lng, area_lat):
        """Return the request headers for one shop, including its x-shard value."""
        return {
            'accept': 'application/json, text/plain, */*',
            "Accept-Encoding": 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'referer': 'https://h5.ele.me/shop/',
            'x-shard': 'shopid=%s;loc=%s,%s' % (shop_id, area_lng, area_lat),
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/79.0.3945.130 Mobile/13B143 Safari/601.1.46',
            'x-ua': 'RenderWay/H5 AppName/wap',
            'x-uab': '122#zIEYlJ58EEx2MEpZy4pjEJponDJE7SNEEP7ZpJRBuDPpJFQLpCGwoHZDpJEL7SwBEyGZpJLlu4Ep'
        }

    @classmethod
    def _classify_response(cls, j):
        """Map a decoded response to 'refresh_cookies', 'need_slide', 'invalid_shop' or 'ok'."""
        if j.get("rgv587_flag") == 'sm':
            return 'refresh_cookies'
        ret = j.get("ret")
        if ret is not None and cls._rate_limit_marker in str(ret):
            return 'refresh_cookies'
        jname = j.get("name")
        if jname == 'UNAUTHORIZED':
            return 'refresh_cookies'
        if jname == 'NEED_SLIDE':
            return 'need_slide'
        if jname == 'INVALID_RESTAURANT_ID':
            return 'invalid_shop'
        return 'ok'

    def _fetch_comment_page(self, ws, comment_url, headers, proxies, cookies_invalid_count):
        """Download one comment page, retrying until it is stored or rejected.

        Returns:
            A ``(count, cookies_invalid_count)`` pair.  ``count`` is the number
            of comments appended to ``ws``, or None when the API reports the
            restaurant id as invalid.  The second item is the updated number
            of consecutive NEED_SLIDE answers.
        """
        while True:
            try:
                res = self.requests_json(comment_url, cookies=self.cookies,
                                         headers=headers, proxies=proxies)
                if res is None:
                    time.sleep(1)
                    continue
                j = json.loads(res)
                status = self._classify_response(j)
                if status == 'refresh_cookies':
                    self.get_cookies_wait()
                    cookies_invalid_count = 0
                    continue
                if status == 'need_slide':
                    cookies_invalid_count += 1
                    if cookies_invalid_count >= self._max_slide_retries:
                        self.get_cookies_wait()
                        cookies_invalid_count = 0
                    else:
                        # Pause before retrying instead of hammering the API.
                        time.sleep(1)
                    continue
                if status == 'invalid_shop':
                    time.sleep(1)
                    return None, cookies_invalid_count
                comment_size = self.get_comment(ws, j)
                # Random pause between pages to look less like a bot.
                time.sleep(random.randint(1, 3))
                return comment_size, cookies_invalid_count
            except Exception as err:
                LOGGER.exception(err)
                # Without a pause a persistent failure would spin this loop.
                time.sleep(1)

    def get_comment_thread(self, index, proxy_type='pool'):
        """Worker loop: crawl the comments of every queued shop.

        For each shop record a workbook is created, up to ``_max_pages`` pages
        are downloaded into it, and it is saved as
        ``./data/comment/<shop name>.xlsx``.  Paging stops early when a page is
        empty or the API rejects the restaurant id.  The loop never returns.

        Args:
            index: Position of this thread in ``get_menu_thread_runningflag``.
            proxy_type: Passed to ``all_proxy.get_proxy``; the second item of
                its result is used as the ``proxies`` mapping of every request.
        """
        cookies_invalid_count = 0
        proxies = self.all_proxy.get_proxy(proxy_type)[1]
        limit = self._page_size
        while True:
            try:
                if not (self.all_proxy is None or self.all_proxy.pool_enable):
                    time.sleep(1)
                    continue
                self.get_menu_thread_runningflag[index] = 0
                shopid_info = self.shopid_queue.get()
                self.shopid_queue.task_done()
                self.get_menu_thread_runningflag[index] = 1
                shop_name = shopid_info[0]
                shop_id = shopid_info[1]
                area_lng = str(round(shopid_info[3], 6))
                area_lat = str(round(shopid_info[2], 6))
                headers = self._build_headers(shop_id, area_lng, area_lat)
                wb = Workbook()
                ws = wb.active
                ws.append(['userID', 'sorce', 'date', 'text', 'pic_num', 'picture'])
                offset = 0
                for _ in range(self._max_pages):
                    comment_url = 'https://h5.ele.me/pizza/ugc/restaurants/%s/batch_comments?offset=%s&limit=%s' % (shop_id, offset, limit)
                    comment_size, cookies_invalid_count = self._fetch_comment_page(
                        ws, comment_url, headers, proxies, cookies_invalid_count)
                    if not comment_size:
                        # None: invalid restaurant id; 0: no more comments.
                        break
                    offset += limit
                wb.save('./data/comment/' + shop_name + '.xlsx')
            except Exception as err:
                LOGGER.exception(err)

    def save_thread(self):
        """Save queued rows to Excel files, splitting when too large.

        Every ten seconds all rows waiting on ``save_queue`` are appended to
        the current workbook, which is saved if anything was added.  Once it
        holds ``ws_cnt_max`` rows or more, a new workbook with the next file
        index is started.  The loop never returns.
        """
        from queue import Empty  # local import: the file's import block is not visible here

        file_name_basic = './data/menu/menu_%s' % get_unixtime_from_local()
        file_index = 1
        wb = Workbook()
        ws = wb.active
        ws_cnt = 0
        ws_cnt_bak = 0
        ws_cnt_max = 20000
        file_name = file_name_basic + "_%s.xlsx" % file_index
        file_index += 1
        while True:
            try:
                time.sleep(10)
                while True:
                    try:
                        data = self.save_queue.get_nowait()
                    except Empty:
                        break
                    self.save_queue.task_done()
                    ws.append(data)
                    ws_cnt += 1
                if ws_cnt != ws_cnt_bak:
                    wb.save(file_name)
                    ws_cnt_bak = ws_cnt
                    if ws_cnt >= ws_cnt_max:
                        # Current file is full and saved: start the next one.
                        wb = Workbook()
                        ws = wb.active
                        ws_cnt = 0
                        ws_cnt_bak = 0
                        file_name = file_name_basic + "_%s.xlsx" % file_index
                        file_index += 1
            except Exception as err:
                print(err)
                time.sleep(1)
def eleme_comment():
file_name = './data/comment/shop_id.xlsx'
comment = ElemeComment(file_name=file_name)
while True:
time.sleep(10)
print('--- 剩余%s' % comment.shopid_queue.qsize())4. Data acquisition: The script iterates through pagination (offset/limit) to collect all comments, handling anti‑scraping responses such as slide verification or rate limiting, and writes each store's comments to a separate Excel workbook.
5. Data storage: Results can be saved as Excel files, or optionally inserted into MongoDB or MySQL databases for further analysis.
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Python Programming Learning Circle
A global community of Chinese Python developers offering technical articles, columns, original video tutorials, and problem sets. Topics include web full‑stack development, web scraping, data analysis, natural language processing, image processing, machine learning, automated testing, DevOps automation, and big data.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
