
51_job Scraper Code

Scraper: Part 1

First, scrape the jobid from each of the 2,000 search-result pages.


The jobid can be found in a script tag on the page:

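As a rough sketch of that extraction step, assuming the object embedded in the script tag parses as strict JSON (the full script below pulls out the same string but feeds it to eval instead):

import json
import requests
from bs4 import BeautifulSoup

def extract_jobids(page_url):
    # fetch one search-result page
    res = requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(res.text, 'html.parser')
    # the result data sits in one of the trailing <script> tags
    script_text = soup.find_all('script')[-4].string
    # keep everything from the first '{' and drop a possible trailing ';'
    json_text = script_text[script_text.find('{'):].strip().rstrip(';')
    data = json.loads(json_text)
    return [item.get('jobid') for item in data['engine_search_result']]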

Scrape the jobids and write them to a file, ready for the URL splicing in the next part.

import requests
import chardet
from bs4 import BeautifulSoup
import csv
from openpyxl import Workbook
import json
import random
import threading
import time
def getOnePageInfo(url):
    # request the page
    res=requests.get(url,
                     headers={'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}
                    )
    # parse the response into a BeautifulSoup object
    soup=BeautifulSoup(res.text,'html.parser')

    # the search results are embedded in one of the trailing <script> tags,
    # so grab the raw text of that tag
    allstring=soup.find_all('script')[-4].string

    # approach 1: split once on '=', the second piece is the data
    data=allstring.split('=',1)[-1]

    # approach 2: slice from the first '{' (same result, kept for reference)
    index=allstring.find('{')
    data2=allstring[index:]

    # evaluate the string into a Python dict
    dict_data=eval(data)

    bigdata = []
    for each in dict_data['engine_search_result']:
        oneInfo=[]
        # keep only the jobid of each posting
        oneInfo.append(each.get('jobid'))
        # add this record to bigdata
        bigdata.append(oneInfo)

    return bigdata
# helper class for saving 2D lists
class MySave():
    def __init__(self):
        pass
    def saveToCsv(self,data,fileName:str,mode='w'):
        with open(fileName,mode=mode,encoding='utf-8',newline='')as f:
            csvfile=csv.writer(f)
            # write each row of data
            for each in data:
                csvfile.writerow(each)
            print(fileName,'saved')
    def saveToExcel(self,data,fileName):
        # create a workbook
        wb=Workbook()
        # grab the active worksheet
        sheet=wb.active
        # write the rows
        for each in data:
            sheet.append(each)
        wb.save(fileName)
        print(fileName,'saved')
save = MySave()
 
def getJobInfo(startNum,endNum):
    for i in range(startNum,endNum):
        time.sleep(random.randint(1,3))
        # build the search-result url for page i
        url=f'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,{i}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        print(f'Scraping page {i}')
        # fetch the page and extract the jobids
        data = getOnePageInfo(url)
        # append them to the csv file
        save.saveToCsv(data,'job万条id数据.csv','a')

# split the 2,000 pages across four threads
t1 = threading.Thread(target=getJobInfo , args=(1,501))
t2 = threading.Thread(target=getJobInfo , args=(501,1001))
t3 = threading.Thread(target=getJobInfo , args=(1001,1501))
t4 = threading.Thread(target=getJobInfo , args=(1501,2001))

t1.start()
t2.start()
t3.start()
t4.start()
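One caveat: the four threads all append to the same CSV file at once, so rows from different threads can interleave. A minimal sketch of one way to serialize the writes, reusing getOnePageInfo and save from above (csv_lock and getJobInfoLocked are illustrative names, not from the original script):

csv_lock = threading.Lock()   # guards the shared csv file

def getJobInfoLocked(startNum, endNum):
    for i in range(startNum, endNum):
        time.sleep(random.randint(1, 3))
        url = f'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,{i}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        data = getOnePageInfo(url)
        with csv_lock:                     # only one thread writes at a time
            save.saveToCsv(data, 'job万条id数据.csv', 'a')

threads = [threading.Thread(target=getJobInfoLocked, args=(s, e))
           for s, e in [(1, 501), (501, 1001), (1001, 1501), (1501, 2001)]]
for t in threads:
    t.start()
for t in threads:
    t.join()                               # wait for every page range to finish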

Scraper: Part 2

Read the ids saved in Part 1 in a loop and splice them into new links.

For example, here is a job posting link from an arbitrary page: https://jobs.51job.com/hangzhou/126390246.html?

Only the jobid in it varies; the rest of the URL stays the same, so we just splice in the ids collected in Part 1.

The template for the spliced links:
'https://jobs.51job.com/guangzhou-thq/{i}.html?'
import requests
import chardet
from bs4 import BeautifulSoup
import csv
from openpyxl import Workbook
import json
import random
import threading
import time
import re
url_list = []
# read the jobid file produced in Part 1 and collect the ids into a list
with open('job万条id数据.csv','r') as f:
    for i in f.readlines():
        i = i.strip()
        url_list.append(i)
# e.g. https://jobs.51job.com/guangzhou-thq/123150402.html?
# splice each jobid into the detail-page url template
the_url_list = []
for i in url_list:
    the_url_list.append(f'https://jobs.51job.com/guangzhou-thq/{i}.html?')

def spider(url):
    headers = {
              'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.48'}
    response = requests.get(url=url,headers = headers)
    response.encoding = 'gbk'
    soup=BeautifulSoup(response.text,"html.parser")
    # header block that holds the basic job info
    result=soup.find(class_='cn')
    # job title
    name=result.find('h1').text.strip()
    # salary
    xinzi=result.find('strong').text.replace('/月','').strip()
    # company name
    mingzi=result.find('a',class_='catn').text.strip()
    # location / experience / degree line
    xinxi=result.find('p',attrs={'class': 'msg'}).text.replace('\xa0','').strip()
    # benefits
    fuli=result.find(class_='t1').text.strip()
    # job description
    bmsg=soup.find(class_='job_msg').text.replace('\n','').replace('\xa0','').replace("【",'').replace("】",'').strip()
    row=[name,xinzi,mingzi,xinxi,fuli,bmsg]
    # append the record as one row of the csv
    with open('51_job.csv', 'a', encoding='utf-8-sig', newline='') as csvFile:
        csv.writer(csvFile).writerow(row)
count = 0  # progress counter
# walk through the spliced links built above
for i in the_url_list:
    spider(i)
    count+=1
    # pause briefly between requests
    time.sleep(random.randint(1,3))
    print('Scraping item %s' % count)
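In practice some ids lead to pages whose layout differs from what spider() expects; when a field is missing, find() returns None and the .text access raises AttributeError, which stops the whole loop. A small sketch of the same driver loop that logs and skips failed pages instead (illustrative only):

failed = []
for count, url in enumerate(the_url_list, start=1):
    try:
        spider(url)
    except (AttributeError, requests.RequestException) as err:
        # layout changed, field missing, or the request failed: log and move on
        failed.append(url)
        print('Skipped %s: %s' % (url, err))
        continue
    time.sleep(random.randint(1, 3))       # pause between requests
    print('Scraped item %s' % count)

print('Done, %s pages skipped' % len(failed))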


