Adult-site video crawler: scraping 320,000 video records
1. Testing showed that the video-list API returns 100 records per page and that the cached data is regenerated by JavaScript on each visit, so the first step is to execute that JavaScript through Selenium. Note also that hitting the API more than once within two seconds gets the client rate-limited.
import platform
import sys

from selenium import webdriver

def drivergo():
    system = platform.system()
    if system == 'Windows':
        executable_path = '../driver/chrome/chromedriver.exe'
    elif system == 'Linux':
        executable_path = '/www/task/driver/chromedriver'
    else:
        sys.exit(0)
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')  # avoids the "DevToolsActivePort file doesn't exist" error
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')  # the Chrome docs recommend this flag to work around a bug
    options.add_argument('blink-settings=imagesEnabled=false')  # skip loading images for speed
    options.add_argument('--headless')  # no visible window; startup fails on Linux without a display otherwise
    driver = webdriver.Chrome(executable_path=executable_path, options=options)
    return driver
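As a quick sanity check, the driver only needs to load the site once so its JavaScript regenerates the cached JSON; the plain requests session built in step 3 can then read that JSON directly. A minimal usage sketch (the URL is a stand-in, since the real site is withheld):

import time

driver = drivergo()
driver.get('https://example.com/')  # stand-in URL; loading the page runs the site's JS
time.sleep(2)  # the API throttles clients that hit it more than once within two seconds
driver.quit()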
2. The video data is updated daily, so we first need to find out how many pages there are in total.
import time

def Getpage(session):
    try:
        host = '...'  # URL withheld due to the nature of the site
        date1 = str(time.strftime('%Y-%m-%d', time.localtime()))
        date2 = str(time.strftime('%Y-%m-%d_%H', time.localtime()))
        apiurl = host + 'static/videocache/' + date1 + '/videolist_' + date2 + '_-_-_-_100_1.json'
        info = session.get(apiurl, timeout=5, verify=False).json()
        endpage = info['last_page']
    except Exception:
        endpage = 3209  # fall back to the last known page count
    return endpage
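The `_100_1.json` suffix in the list URL appears to encode the page size (100) and the page number (1). Assuming that inference holds, the URL for an arbitrary page can be built the same way; pageurl below is a hypothetical helper, not from the original post:

import time

def pageurl(host, page):
    # assumed pattern: the trailing number in videolist_..._100_<page>.json is the page index
    date1 = time.strftime('%Y-%m-%d', time.localtime())
    date2 = time.strftime('%Y-%m-%d_%H', time.localtime())
    return host + 'static/videocache/' + date1 + '/videolist_' + date2 + '_-_-_-_100_' + str(page) + '.json'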
3. Long-running HTTPS sessions with many connections were throwing SSL errors, so certificate verification is disabled (verify=False) and up to 3 retries are mounted on the session.
import requests
import urllib3
from requests.adapters import HTTPAdapter

urllib3.disable_warnings()  # silence the InsecureRequestWarning caused by verify=False
nopage = []  # page numbers that failed to download
s = requests.session()
s.mount('http://', HTTPAdapter(max_retries=3))
s.mount('https://', HTTPAdapter(max_retries=3))
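With the retrying session in place, each page fetch can be wrapped so that a failure records the page number in nopage instead of aborting the whole run. This fetchpage helper is a sketch (it assumes the hypothetical pageurl helper from step 2):

def fetchpage(page, host):
    try:
        return s.get(pageurl(host, page), timeout=5, verify=False).json()
    except Exception:
        nopage.append(page)  # remember the failed page for a later retry
        return None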
4. Parse the fetched data and save it to the database.
import pymysql

# Connect to the database
def Connect():
    conn = pymysql.connect(
        host='localhost',
        user='task',
        password='123456',
        db='task',
        charset='utf8',
        autocommit=True,  # commit each insert automatically, equivalent to calling conn.commit()
        cursorclass=pymysql.cursors.DictCursor  # return rows as dicts keyed by column name
    )
    return conn
# Save a batch of video records
def Insert(videoinfo):
    conn = Connect()
    cur = conn.cursor()
    # videoinfo is a pre-built "(...),(...)"  values string; INSERT IGNORE skips rows whose id already exists
    insert_sql = "insert ignore into yellovideo(videoid, videotitle, videostatus, videothumb, videopreview, videopanorama, videodescription, videourl, videocomefrom, videotags) values" + videoinfo + ";"
    cur.execute(insert_sql)
    cur.close()
    conn.close()
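The INSERT assumes a yellovideo table already exists; its real DDL is not shown in the post. A hypothetical schema inferred from the column list (videoid, videostatus, and videocomefrom are inserted unquoted, so they are presumably numeric, and INSERT IGNORE needs a unique key on videoid to deduplicate):

create_sql = """
CREATE TABLE IF NOT EXISTS yellovideo (
    videoid INT UNSIGNED PRIMARY KEY,  -- unique id from the API; lets INSERT IGNORE skip duplicates
    videotitle VARCHAR(255),
    videostatus INT,
    videothumb VARCHAR(512),
    videopreview VARCHAR(512),
    videopanorama TEXT,
    videodescription TEXT,
    videourl VARCHAR(512),
    videocomefrom INT,
    videotags TEXT
) DEFAULT CHARSET=utf8;
"""
conn = Connect()
with conn.cursor() as cur:
    cur.execute(create_sql)
conn.close()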
def Process(info):
    videosinfo = None
    for video in info['data']:
        videoid = video['id']  # video id
        videotitle = video['title']  # video title
        videostatus = video['status']  # status
        videothumb = video['thumb']  # probably the cover image
        videopreview = video['preview']  # preview (animated image)
        videopanorama = video['panorama']  # preview (still screenshots)
        videodescription = video['description']  # description
        videourl = video['video_url']  # video URL
        videocomefrom = video['comefrom']  # video source
        videotags = video['tags']  # video tags
        # build one "(...)" values tuple; note that only the title is escaped,
        # so quotes in other string fields would still break the statement
        videoinfo = '(' + str(videoid) + ',"' + str(videotitle.replace("\"", "\'").replace("\\", "\\\\")) + '",' + str(
            videostatus) + ',"' + str(videothumb) + '","' + str(videopreview) + '","' + str(
            videopanorama) + '","' + str(videodescription) + '","' + str(videourl) + '",' + str(
            videocomefrom) + ',"' + str(videotags) + '")'
        if videosinfo:
            videosinfo += ',' + videoinfo
        else:
            videosinfo = videoinfo
    Insert(videosinfo)  # save the whole page with a single statement
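Putting the pieces together, a main loop might look like the sketch below; the original post does not show its driver code, so the page loop, the two-second pacing, and the final nopage report are all assumptions:

if __name__ == '__main__':
    host = '...'  # withheld, as above
    driver = drivergo()
    driver.get(host)  # let the site's JavaScript regenerate the cached JSON
    driver.quit()
    endpage = Getpage(s)
    for page in range(1, endpage + 1):
        info = fetchpage(page, host)
        if info:
            Process(info)  # parse the 100 records and bulk-insert them
        time.sleep(2)  # stay under the two-second rate limit
    print('done; failed pages:', nopage)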
5. Screenshot of the crawler running