import json
import os
import re
import threading
import time
import openpyxl
from DrissionPage import ChromiumOptions, ChromiumPage
def create_excel(file_name):
    """Create ``<file_name>.xlsx`` containing only the header row.

    Args:
        file_name: Base name of the workbook to create; the ``.xlsx``
            suffix is appended here.
    """
    # Column captions written as the first row of the sheet.
    header = ['标题', '价格', '店铺', '地区', '销量']

    book = openpyxl.Workbook()
    sheet = book.active  # the sheet a new workbook exposes as active
    sheet.append(header)
    book.save(f'{file_name}.xlsx')
def write_excel(name, shop_info):
    """Append one row of scraped data to ``<name>.xlsx`` and save the file.

    Args:
        name: Base name of an existing workbook (without ``.xlsx``).
        shop_info: Sequence of cell values to add as the next row of the
            active worksheet.
    """
    path = f'{name}.xlsx'
    book = openpyxl.load_workbook(path)
    # Add the record below the rows already present in the active sheet.
    book.active.append(shop_info)
    book.save(path)
def load_jsonp(text):
    """Extract and decode the JSON payload of a JSONP response body.

    The payload is the text between the first ``(`` and the closing ``)``
    at the end of *text*. Whitespace and a single ``;`` after the closing
    parenthesis are tolerated (``callback({...});``).

    Args:
        text: Response body. Only ``str`` values are parsed.

    Returns:
        The decoded JSON value, or an empty dict when *text* is not a
        string (including an already-parsed ``dict``) or does not have the
        ``name(...)`` JSONP shape. An empty dict is returned instead of
        ``None`` so callers can chain ``.get`` safely.

    Raises:
        json.JSONDecodeError: If the text between the parentheses is not
            valid JSON.
    """
    if not isinstance(text, str):
        return {}
    # Raw string: "\(" inside a normal string literal is an invalid escape.
    match = re.match(r'^.*?\((.*?)\)\s*;?\s*$', text, re.S)
    if match is None:
        return {}
    return json.loads(match.group(1))
def listen_data(shop_name, page):
    """Consume captured responses and store every product they contain.

    Iterates over ``page.listen.steps()``. Each packet's response body is
    decoded with ``load_jsonp``; every entry of ``data.itemsArray`` is
    printed and appended to ``<shop_name>.xlsx`` through ``write_excel``.

    Packets whose body does not decode to the expected structure are
    skipped, and a field missing from an item is written as an empty cell,
    so one unexpected response does not abort the whole loop.

    Args:
        shop_name: Search keyword; also the base name of the workbook that
            rows are appended to.
        page: Page object whose ``listen`` interface yields the packets.
    """
    for packet in page.listen.steps():
        payload = load_jsonp(packet.response.body)
        # load_jsonp can hand back a non-dict (e.g. None or a list) for
        # bodies that are not the expected JSONP object.
        if not isinstance(payload, dict):
            continue
        section = payload.get('data')
        if not isinstance(section, dict):  # "data" absent or null
            continue

        for shop in section.get('itemsArray') or []:
            if not isinstance(shop, dict):
                continue
            seller = shop.get('shopInfo')
            if not isinstance(seller, dict):
                seller = {}
            # Same column order as the header: title, price, shop, region, sales.
            data = [
                shop.get('title'),
                shop.get('price'),
                seller.get('title'),
                shop.get('procity'),
                shop.get('realSales'),
            ]
            print(data)
            write_excel(shop_name, data)
def main():
    """Run the interactive Taobao search scraper.

    Steps:
        1. Build browser options from ``user.ini`` when that file exists;
           otherwise create options with an automatically chosen port and
           a fresh user-data directory, and save them to ``user.ini``.
        2. Optionally open the Taobao login page.
        3. Ask for the search keyword, create ``<keyword>.xlsx``, start the
           network listener and run ``listen_data`` in a background thread.
        4. Open the search result page and scroll down 100 times, sleeping
           5 seconds after each scroll.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    if os.path.exists('user.ini'):
        options = ChromiumOptions(ini_path='user.ini')
    else:
        options = ChromiumOptions().auto_port().set_user_data_path(
            f'user_data_{int(time.time() * 100)}'
        )
        options.save('user.ini')
    page = ChromiumPage(addr_or_opts=options)

    choice = input('请选择是否要重新登录(y/n):')
    # Accept "Y", " y " and similar so a stray space or capital is not a "no".
    if choice.strip().lower() == 'y':
        page.get('https://login.taobao.com/member/login.jhtml')

    shop_name = input('请输入爬取商品名称:')
    create_excel(shop_name)

    page.listen.start('wirelessrecommend.recommend/2.0')
    # daemon=True: listen_data loops over page.listen.steps() with no end
    # condition, so a non-daemon thread would keep the process alive forever
    # after the scrolling below has finished. The 5 s sleep that follows the
    # last scroll gives the listener time to handle the final packets.
    listen_thread = threading.Thread(
        target=listen_data,
        args=(shop_name, page),
        daemon=True,
    )
    listen_thread.start()

    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot corrupt the query string.
    page.get(f'https://s.taobao.com/search?page=1&q={quote(shop_name)}&tab=all')
    for _ in range(100):
        page.scroll.down(800)
        print('休眠5秒...')
        time.sleep(5)
if __name__ == '__main__':
main()
因篇幅问题不能全部显示,请点此查看更多更全内容
Copyright © 2019- niushuan.com 版权所有 赣ICP备2024042780号-2
违法及侵权请联系:TEL:199 1889 7713 E-MAIL:2724546146@qq.com
本站由北京市万商天勤律师事务所王兴未律师提供法律服务