使用Python抓取“链家上海二手房”网页信息,在不考虑并发多线程的前提下,加入单个网页停留0.2s反爬,总计耗时58s完成所有信息格式化爬取【在Baidu AIStudio完成demo测试】
0 思路分析
- 网页结构分析
1.1单个页面解析如下,单个房源字段 = ["房源标题","房源链接","小区","板块","房源总价","房源单价","具体信息","关注度"]
1.2 url地址及总页数
2. Demo设计&验证【以第1页为例】
2.1 解析页面源代码
在Chrome浏览器开发者模式下,可以知道原始链接地址(1/),其页面代码已经足够反馈需要的页面信息
2.2 在Baidu AIStudio里验证demo
2.2.1导入需要的库文件
import pandas as pd
import numpy as np
import lxml
import requests
from lxml import etree
from bs4 import BeautifulSoup as BS
import re
import time
2.2.2 解析HTML代码
2.2.3 解析xpath【选择xpath而非BeautifulSoup的原因:易于结构化和验证】
房源标题xpath
/html/body/div[4]/div[1]/ul/li/div[1]/div[1]/a/text()
/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[1]/a/text()
/html/body/div[4]/div[1]/ul/li[2]/div[1]/div[1]/a/text()
房源链接xpath
/html/body/div[4]/div[1]/ul/li/div[1]/div[1]/a/@href
/html/body/div[4]/div[1]/ul/li[#]/div[1]/div[1]/a/@href
/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[1]/a/@href
/html/body/div[4]/div[1]/ul/li[2]/div[1]/div[1]/a/@href
小区xpath
/html/body/div[4]/div[1]/ul/li/div[1]/div[2]/div/a[1]/text()
/html/body/div[4]/div[1]/ul/li[#]/div[1]/div[2]/div/a[1]/text()
/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[2]/div/a[1]/text()
/html/body/div[4]/div[1]/ul/li[2]/div[1]/div[2]/div/a[1]/text()
板块xpath
/html/body/div[4]/div[1]/ul/li/div[1]/div[2]/div/a[2]/text()
/html/body/div[4]/div[1]/ul/li[#]/div[1]/div[2]/div/a[2]/text()
/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[2]/div/a[2]/text()
/html/body/div[4]/div[1]/ul/li[2]/div[1]/div[2]/div/a[2]/text()
具体信息xpath
/html/body/div[4]/div[1]/ul/li/div[1]/div[3]/div/text()
/html/body/div[4]/div[1]/ul/li[#]/div[1]/div[3]/div/text()
/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[3]/div/text()
/html/body/div[4]/div[1]/ul/li[2]/div[1]/div[3]/div/text()
房源总价xpath
/html/body/div[4]/div[1]/ul/li/div[1]/div[6]/div[1]/span/text()
/html/body/div[4]/div[1]/ul/li[#]/div[1]/div[6]/div[1]/span/text()
/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[1]/span/text()
/html/body/div[4]/div[1]/ul/li[2]/div[1]/div[6]/div[1]/span/text()
房源单价xpath
/html/body/div[4]/div[1]/ul/li/div[1]/div[6]/div[2]/span/text()
/html/body/div[4]/div[1]/ul/li[#]/div[1]/div[6]/div[2]/span/text()
/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[6]/div[2]/span/text()
/html/body/div[4]/div[1]/ul/li[2]/div[1]/div[6]/div[2]/span/text()
关注度xpath
/html/body/div[4]/div[1]/ul/li/div[1]/div[4]/text()
/html/body/div[4]/div[1]/ul/li[#]/div[1]/div[4]/text()
/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[4]/text()
/html/body/div[4]/div[1]/ul/li[2]/div[1]/div[4]/text()
- 使用xpath验证各要素信息提取是否与网页信息保持一致,且彼此数量相等【均为30】
至此demo验证完毕,准备进一步封装函数
3 封装程序
#定义抓取每页信息的函数
#字段= ["房源标题","房源链接","小区","板块","房源总价","房源单价","具体信息","关注度"]
def get_page(page: int) -> pd.DataFrame:
    """Fetch and parse one listing page of Lianjia Shanghai second-hand homes.

    Returns a DataFrame with the 8 columns
    ["房源标题","房源链接","小区","板块","房源总价","房源单价","具体信息","关注度"],
    one row per listing (normally 30 rows per page).
    """
    # NOTE(review): the original URL was redacted; Lianjia paginates as
    # .../ershoufang/pg{page}/ — confirm against the live site.
    url = f"https://sh.lianjia.com/ershoufang/pg{page}/"
    # Send a desktop-browser User-Agent so the server returns the normal page.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/90.0.4430.212 Safari/537.36"
        )
    }
    rsp = requests.get(url=url, headers=headers).text
    html = etree.HTML(rsp)
    # Build every column first, then construct the DataFrame in one shot:
    # assigning 30-element lists to a 0-row DataFrame raises a length-mismatch
    # ValueError in pandas.
    data = {
        "房源标题": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[1]/a/text()"),
        "房源链接": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[1]/a/@href"),
        "小区": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[2]/div/a[1]/text()"),
        "板块": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[2]/div/a[2]/text()"),
        # Total price: append the unit "万" (10,000 CNY) to the raw number.
        "房源总价": [
            str(x) + "万"
            for x in html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[6]/div[1]/span/text()")
        ],
        # Unit price: keep only the first run of digits from e.g. "58000元/平米".
        "房源单价": [
            re.findall(r"\d+", x)[0]
            for x in html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[6]/div[2]/span/text()")
        ],
        "具体信息": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[3]/div/text()"),
        "关注度": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[4]/text()"),
    }
    df = pd.DataFrame(data)
    print(f"第{page}页解析完成")
    # Original code returned first, so the print and this sleep were unreachable;
    # the 0.2 s pause is the anti-scraping delay and must run before returning.
    time.sleep(0.2)
    return df
4 运行
使用以下代码完成爬取信息,并最终导出结果,共有3000行,8列:
- 3000行,符合“每页30行”x “100页”= 3000
- 8列,符合选取的8个字段 = ["房源标题","房源链接","小区","板块","房源总价","房源单价","具体信息","关注度"]
附录
- 将Baidu AIStudio的ipynb notebook代码,重新封装为py文件,可以用于在本地运行【下面代码可以直接复制粘贴】
#调包侠
import pandas as pd
import numpy as np
import lxml
import requests
from lxml import etree
from bs4 import BeautifulSoup as BS
import re
import time
#定义抓取每页信息的函数
#字段= ["房源标题","房源链接","小区","板块","房源总价","房源单价","具体信息","关注度"]
def get_page(page: int) -> pd.DataFrame:
    """Fetch and parse one listing page of Lianjia Shanghai second-hand homes.

    Returns a DataFrame with the 8 columns
    ["房源标题","房源链接","小区","板块","房源总价","房源单价","具体信息","关注度"],
    one row per listing (normally 30 rows per page).
    """
    # NOTE(review): the original URL was redacted; Lianjia paginates as
    # .../ershoufang/pg{page}/ — confirm against the live site.
    url = f"https://sh.lianjia.com/ershoufang/pg{page}/"
    # Send a desktop-browser User-Agent so the server returns the normal page.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/90.0.4430.212 Safari/537.36"
        )
    }
    rsp = requests.get(url=url, headers=headers).text
    html = etree.HTML(rsp)
    # Build every column first, then construct the DataFrame in one shot:
    # assigning 30-element lists to a 0-row DataFrame raises a length-mismatch
    # ValueError in pandas.
    data = {
        "房源标题": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[1]/a/text()"),
        "房源链接": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[1]/a/@href"),
        "小区": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[2]/div/a[1]/text()"),
        "板块": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[2]/div/a[2]/text()"),
        # Total price: append the unit "万" (10,000 CNY) to the raw number.
        "房源总价": [
            str(x) + "万"
            for x in html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[6]/div[1]/span/text()")
        ],
        # Unit price: keep only the first run of digits from e.g. "58000元/平米".
        "房源单价": [
            re.findall(r"\d+", x)[0]
            for x in html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[6]/div[2]/span/text()")
        ],
        "具体信息": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[3]/div/text()"),
        "关注度": html.xpath("/html/body/div[4]/div[1]/ul/li/div[1]/div[4]/text()"),
    }
    df = pd.DataFrame(data)
    print(f"第{page}页解析完成")
    # Original code returned first, so the print and this sleep were unreachable;
    # the 0.2 s pause is the anti-scraping delay and must run before returning.
    time.sleep(0.2)
    return df
#定义主函数
def main():
    """Scrape pages 1-100, concatenate them, and export the result to Excel."""
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # it also concatenates all 100 page frames in one pass.
    df = pd.concat([get_page(i) for i in range(1, 100 + 1)], ignore_index=True)
    df.to_excel("./链家上海_二手房爬取_20210524.xlsx", index=False)  # write to Excel
#运行主函数
if __name__ == "__main__":
    # Original had `start = ()` / `end = ()` (empty tuples), which would make
    # `end - start` raise TypeError; wall-clock timing needs time.time().
    start = time.time()
    print("开始")
    main()
    end = time.time()
    print("100页链家上海二手房信息爬取并导出完成,共耗时{0}s".format(format(end-start,".2f")))