python 爬虫
为什么会想到用Python去爬取QQ空间相册,而且还要区分照片和视频呢?就是因为数量太多了。回想当时,真是想哭。还好,我是一个技术宅,还是个混在通信圈里的技术宅,因此查资料,琢磨出了这大篇幅的脚本。今天代码出了幺蛾子,修正之后又可以执行了,现在分享给有需要的人,希望有人能关注,一块交流交流。
先简单说下,技术要点吧。
原理就是利用 python + selenium + chromedriver 模拟手机浏览器登录,获取相应的Cookie,属于爬虫的一些简单应用。下面是简单教程(这里假定你的电脑已安装了Python):
1)直接使用pip安装
pip3 install selenium
2)下载chromedriver和Chrome浏览器。这里要注意的是,chromedriver和Chrome浏览器是有版本对应关系的,也就是说,不同版本的Chrome浏览器对应着不同版本的chromedriver。下面是最新的对应表,直接用就可以了。
ChromeDriver v74.0.3729.6 (2019-03-14)----------Supports Chrome v74下载地址:
这两样都下载了,然后,把 chromedriver放在相应的浏览器安装目录下,
好了,做好这两步,环境就搭建完成了,下面直接分享代码:
#!coding:utf-8
# ver: v2.1
"""Download the photos and videos of a QQ Zone (Qzone) user's albums.

The script drives Chrome (emulating a Nexus 5) through Selenium to log in,
reads ``window.shine0callback`` (used as the ``qzonetoken``) and the session
cookies from the browser, derives ``g_tk`` from the ``p_skey`` cookie, and
then uses ``requests`` to list the albums and ``urllib`` to download every
photo/video into ``path``.

NOTE(review): the published copy of this script was damaged -- all URL
literals and many dotted call names were stripped. The calls were restored
from the surviving fragments; every value that could not be recovered from
the text is marked with ``NOTE(review)`` below and must be confirmed.

NOTE(review): ``executable_path=``, ``chrome_options=`` and
``find_element_by_id`` are the Selenium 3 API the article was written for;
newer Selenium releases need the ``Service`` / ``By`` equivalents.
"""
import json
import logging
import os
import time
import urllib
import urllib.request  # `import urllib` alone does not load the submodule

import requests
import urllib3
from selenium import webdriver

# Write a debug log next to the script.
logging.basicConfig(filename='example.log', filemode="w", level=logging.DEBUG)
logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')

dlpsnum = 500        # highest `ps` offset requested; getPic steps down from here
path = r'E:\NONO'    # directory the downloaded files are saved into

# Login information.
login_uin = 'XXXXXXXX'  # QQ number used to log in
pwd = 'XXXXXXXX'        # login password (could be read with input() instead)
album_uin = 'XXXXXX'    # QQ number whose albums are read

# Step between successive `ps` offsets requested by getPic.
PAGE_STEP = 20

# NOTE(review): the three URL literals were stripped from the published text
# (only `';` / `";` remnants survive). These are best-guess mobile Qzone
# endpoints for this technique -- confirm before use.
LOGIN_URL = 'https://qzone.qq.com/'
ALBUM_LIST_URL = 'https://mobile.qzone.qq.com/list'
PHOTO_LIST_URL = 'https://h5.qzone.qq.com/webapp/json/mqzone_photo/getPhotoList2'

# NOTE(review): the file name at the end of this path was truncated in the
# published text; presumably the chromedriver binary placed in Chrome's
# install directory, as the article instructs.
CHROMEDRIVER_PATH = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"

# NOTE(review): the version numbers of this User-Agent were stripped from the
# published text; the ones below are placeholders for a desktop Chrome UA.
USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/56.0.2924.87 Safari/537.36')


def _compute_g_tk(skey):
    """Return the ``g_tk`` request parameter derived from the ``p_skey`` cookie."""
    e = 5381
    for ch in skey:
        e += (e << 5) + ord(ch)
    return str(2147483647 & e)


def _save_media(label, dlpicname, extension, dlurl):
    """Download ``dlurl`` to ``path/<dlpicname><extension>`` unless it exists.

    The file is fetched to a ``.part`` name first and renamed on success, so
    an interrupted download is not mistaken for a finished one on the next
    run. A failed download is logged and skipped rather than aborting the
    whole run.
    """
    dest_dir = os.path.join(path, dlpicname + extension)
    # Skip files already downloaded by an earlier run.
    if os.path.exists(dest_dir):
        return
    print(label + dlpicname + ', url:' + dlurl)
    partial = dest_dir + '.part'
    try:
        urllib.request.urlretrieve(dlurl, partial)
    except OSError:  # URLError/HTTPError are OSError subclasses
        logging.exception('download failed: %s', dlurl)
        if os.path.exists(partial):
            os.remove(partial)
        return
    os.replace(partial, dest_dir)


def getPic(psid, albumid=None):
    """Download the photos/videos of one album, paging downwards from ``psid``.

    Requests the photo list at offsets ``psid, psid - 20, ...`` down to 0 and
    saves every entry. An entry with a non-blank ``videodata.videourl`` is
    saved as ``.mp4``; otherwise ``pic['1']['url']`` is saved as ``.jpg``.
    Files are named after the entry's ``shoottime``.

    Args:
        psid: first ``ps`` offset to request.
        albumid: id of the album to read. When omitted, the id of the
            module-level ``album`` currently being iterated is used, which is
            what the original ``getPic(psid)`` call relied on.

    Reads the module-level ``s``, ``qzonetoken``, ``g_tk``, ``album_uin`` and
    ``path``.
    """
    if albumid is None:
        albumid = album['pic']['albumid']
    while psid >= 0:
        print(psid)
        # Read the photo list of the current album at this offset.
        url = (PHOTO_LIST_URL + "?qzonetoken=" + qzonetoken + "&g_tk=" + g_tk
               + "&uin=" + album_uin + "&albumid=" + albumid
               + "&ps=" + str(psid))
        print("qzonetoken :" + qzonetoken)
        r = s.get(url)
        photo_datas = json.loads(r.content.decode('utf-8'))
        # An offset past the end of the album may carry no photos at all.
        photos = (photo_datas.get('data') or {}).get('photos') or {}
        for group in photos.values():
            for pic in group:
                # NOTE(review): assumes `shoottime` is a Unix timestamp.
                # ':' is not allowed in Windows file names, hence '_'.
                dlpicname = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(pic['shoottime'])).replace(':', '_')
                video_url = (pic.get('videodata') or {}).get('videourl') or ''
                if video_url.strip():
                    _save_media('视频名', dlpicname, '.mp4', video_url)
                else:
                    _save_media('图片名', dlpicname, '.jpg', pic['1']['url'])
        psid = psid - PAGE_STEP
    print("=" * 10)


s = requests.Session()

# Start the browser in mobile-emulation mode and log in.
mobileEmulation = {"deviceName": "Nexus 5"}
options = webdriver.ChromeOptions()
options.add_experimental_option('mobileEmulation', mobileEmulation)
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH,
                          chrome_options=options)
try:
    driver.get(LOGIN_URL)
    driver.find_element_by_id('u').clear()
    driver.find_element_by_id('u').send_keys(login_uin)
    driver.find_element_by_id('p').clear()
    driver.find_element_by_id('p').send_keys(pwd)
    driver.find_element_by_id('go').click()

    # Wait until the page's JavaScript has computed the qzonetoken. There is
    # deliberately no timeout: login may need manual interaction.
    while True:
        qzonetoken = driver.execute_script("return window.shine0callback")
        if qzonetoken:
            break
        # NOTE(review): the sleep interval was lost in the published text.
        time.sleep(0.1)

    # Read the cookies before the browser goes away.
    cookies = driver.get_cookies()
finally:
    # Always close the browser, even when login fails.
    driver.quit()

cookies_ = {cookie['name']: cookie['value'] for cookie in cookies}
skey = cookies_.get('p_skey')
if skey is None:
    raise RuntimeError('p_skey cookie not found; login did not complete')

g_tk = _compute_g_tk(skey)

# Add the browser's cookies to the requests session, then read the album list.
requests.utils.add_dict_to_cookiejar(s.cookies, cookies_)
url = (ALBUM_LIST_URL + "?qzonetoken=" + qzonetoken + "&g_tk=" + g_tk
       + "&format=json&list_type=album&action=0&res_uin=" + album_uin
       + "&count=1")
r = s.get(url)
data = json.loads(r.content.decode('utf-8'))

# Install the download opener once instead of once per file.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', USER_AGENT)]
urllib.request.install_opener(opener)

os.makedirs(path, exist_ok=True)

for album in data['data']['vFeeds']:
    print('相册名:' + album['pic']['albumname'])
    print('相册id:' + album['pic']['albumid'])
    print('图片数量:' + str(album['pic']['albumnum']))
    print('开始下载相册图片:')
    getPic(dlpsnum, album['pic']['albumid'])

# Figure caption from the original article: "QQ空间相册" (QQ Zone album).
通过以上的努力,终于可以把QQ里的几千张宝宝照片下载下来了,相当高兴,不是吗?
#备注:以上获取QQzone Key等方法,来源于互联网。