python 拉勾网数据爬取 (Lagou.com job-data scraper)
# spider.py
# -*- coding: utf-8 -*-
#
# Environment setup:
#   pip install bs4
#   pip install requests
#   sudo pip install <package>     # when system-wide install is needed
# List currently installed packages:
#   pip list
# Install pip itself:
#   sudo easy_install pip
import requests
from bs4 import BeautifulSoup
import json
import time
def crawl_detail(position_id):
    """Fetch the Lagou job-detail page for *position_id* and return the
    job-description text.

    NOTE(review): parameter renamed from ``id`` (it shadowed the builtin);
    the only caller in this file passes it positionally, so this is
    backward compatible.
    """
    url = 'https://www.lagou.com/jobs/%s.html' % position_id
    # Browser-like headers: Lagou rejects requests without a Referer/UA.
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    }
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.content, 'lxml')
    job_bt = soup.find('dd', attrs={'class': 'job_bt'})
    # find() returns None when the anti-crawler page is served instead of
    # a real job page; guard against AttributeError.
    return job_bt.text if job_bt is not None else ''
def main():
    """Crawl pages 1-4 of Lagou's Beijing Python listings via the positionAjax
    JSON endpoint, enrich each position with its detail-page text, and dump
    everything to ``lagou.json``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest',
    }
    # city=%E5%8C%97%E4%BA%AC is URL-encoded "北京" (Beijing).
    list_url = ('https://www.lagou.com/jobs/positionAjax.json'
                '?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false')
    positions = []
    for page in range(1, 5):
        form_data = {
            'first': 'true',
            'pn': page,
            'kd': 'python',
        }
        result = requests.post(list_url, headers=headers, data=form_data)
        json_result = result.json()
        print(json_result)
        print('=' * 50)
        for position in json_result['content']['positionResult']['result']:
            # Keep only the fields we care about.
            position_dict = {
                'position_name': position['positionName'],
                'work_year': position['workYear'],
                'salary': position['salary'],
                'district': position['district'],
                'company_name': position['companyFullName'],
            }
            # Follow up with the detail page for the full job description.
            position_dict['position_detail'] = crawl_detail(position['positionId'])
            positions.append(position_dict)
        # Lagou throttles aggressively ("您操作太频繁,请稍后再试"); sleep
        # between pages — increase this if the throttle message appears.
        time.sleep(5)
    # json.dump with an explicit encoding replaces the old manual
    # encode-then-write dance and keeps non-ASCII characters readable.
    with open('lagou.json', 'w', encoding='utf-8') as fp:
        json.dump(positions, fp, ensure_ascii=False)
if __name__ == '__main__':
    main()
    # crawl_detail('3265286')  # one-off manual check of a single position
# spider_selenium.py — selenium + PhantomJS/ChromeDriver variant
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup

# Driver choices: chromedriver (Chrome) or PhantomJS (headless).
driver = webdriver.PhantomJS(r'C:\Users\hynev\develop\phantomjs\bin\phantomjs.exe')
driver.get('https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=')
# click() returns None, so this prints "None" after clicking the job list.
print(driver.find_element_by_class_name('item_con_list').click())
# Source: https://www.tongpankt.com/7486