Python 爬虫:中国行政区划信息爬取
发布日期:2024-08-03 22:41 点击次数:154
绪论
业务部门需要更新最新的全国区划信息数据,建立基础数据库,权威数据当然是国家统计局的官方数据
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/
图片
这里要做的,就是将其爬取下来。环境准备:我们使用 python 工具爬取数据,并将其保存为 Excel:
python环境 ,略过;
相关依赖 requests、BeautifulSoup、pandas、threading、os;requests 用于 web 请求,并获取页面数据;BeautifulSoup 提取页面数据;pandas 数据分析,此处仅用来方便数据导出;threading 多线程爬取;
代码片段 1、定义地址信息对象,封装解析后的数据:areainfo
class areainfo():
    """Plain data holder for one administrative-division record.

    NOTE(review): the class name and the misspelled attribute ``leve`` are
    kept as-is because the rest of the article instantiates and reads them.
    """

    def __init__(self):
        self.areacode = ''    # administrative division code
        self.areaname = ''    # administrative division name
        self.parentcode = ''  # parent division code
        self.leve = ''        # division level, '1' (province) .. '5' (village)
        self.href = ''        # relative link to the child page

    def as_dict(self):
        """Return the record as a plain dict (used for the pandas export)."""
        return {'areacode': self.areacode, 'areaname': self.areaname,
                'parentcode': self.parentcode, 'leve': self.leve,
                'href': self.href}

    def __repr__(self):
        # Added for debuggability; purely additive, no caller depends on it.
        return 'areainfo(%r, %r, leve=%r)' % (self.areacode, self.areaname,
                                              self.leve)


# 2. The address-parsing class (article section header).
将所有地址解析方法封装为一个类,包含 web 请求、web 解析等方法
图片
2.1 获取 web 信息
# classname: the <tr> class tag on the page; parnetcode: parent division
# code; leve: level of the rows being parsed.
def initAreainfo(self, url, classname, parnetcode, leve):
    '''Parse one listing page into areainfo records (None when fetch fails).'''
    print('页面便签 %s -- 地址等第 %s --- url %s \n' % (classname, leve, url))
    soup = self.getUrl(url)
    if soup is None:
        return None
    # Rows of the current level are selected by their CSS class.
    rows = soup.find_all(name='tr', attrs={'class': classname})
    parsed = []
    for row in rows:
        anchors = row.find_all('a')
        record = areainfo()
        record.leve = leve
        record.parentcode = parnetcode
        if anchors:
            # Linked row: first <a> carries code + href, second the name.
            record.href = anchors[0]['href']
            record.areacode = anchors[0].string
            record.areaname = anchors[1].string
        else:
            # Leaf row without links: read the <td> cells directly.
            record.href = ''
            cells = row.find_all('td')
            if len(cells) == 2:
                record.areacode = cells[0].string
                record.areaname = cells[1].string
            if len(cells) == 3:
                # Three cells: the middle one is a category code, skip it.
                record.areacode = cells[0].string
                record.areaname = cells[2].string
        parsed.append(record)
    return parsed
该处将异常的请求存到 err.log 文件中,以便于后期读取异常链接,补充丢失数据。
# 2.2 Web page parsing (article section 2.2).
def initAreainfo(self, url, classname, parnetcode, leve):
    '''Turn one listing page into a list of areainfo records.

    classname: <tr> CSS class marking this level's rows; parnetcode: code
    of the parent division; leve: level of the rows.  Returns None when
    the HTTP request failed.
    '''
    print('页面便签 %s -- 地址等第 %s --- url %s \n' % (classname, leve, url))
    soup = self.getUrl(url)
    if soup is None:
        return None
    result = []
    for tr in soup.find_all(name='tr', attrs={'class': classname}):
        links = tr.find_all('a')
        info = areainfo()
        info.leve = leve
        info.parentcode = parnetcode
        if len(links) > 0:
            info.href = links[0]['href']
            info.areacode = links[0].string
            info.areaname = links[1].string
        else:
            info.href = ''
            tds = tr.find_all('td')
            if len(tds) == 2:
                info.areacode = tds[0].string
                info.areaname = tds[1].string
            if len(tds) == 3:
                # 3-cell rows (villages) carry a category code in the middle.
                info.areacode = tds[0].string
                info.areaname = tds[2].string
        result.append(info)
    return result
# In the web pages, each division level uses a different row tag; identify
# it with the browser's F12 devtools.  BeautifulSoup extracts the needed
# division rows by that tag's class attribute.
eg
图片
2.3 区划信息提取:各等级区划信息提取,分别调用 2.2 的方法进行解析。每个方法返回地址 list
def getPronvice(self):
    '''Level 1: provinces, parsed from the index page (None on failure).'''
    soup = self.getUrl(self.base)
    if soup is None:
        return None
    provinces = []
    for row in soup.find_all(name='tr', attrs={'class': 'provincetr'}):
        for anchor in row.find_all('a'):
            item = areainfo()
            item.href = anchor['href']
            item.areaname = anchor.get_text()
            # Page name ('11.html') padded to the 6-digit code ('110000').
            item.areacode = anchor['href'].replace('.html', '0000')
            item.parentcode = '0'
            item.leve = '1'
            print(item.__dict__)
            provinces.append(item)
    return provinces

def getCity(self, parent):
    '''Level 2: cities under the given province.'''
    return self.initAreainfo(self.base + parent.href, 'citytr',
                             parent.areacode, '2')

def getCounty(self, parent):
    '''Level 3: counties under the given city.'''
    return self.initAreainfo(self.base + parent.href, 'countytr',
                             parent.areacode, '3')

def getTown(self, parent):
    '''Level 4: towns; None when the county row carries no link.'''
    if parent.href == '':
        return None
    # Town pages live under a sub-directory named by the province prefix.
    url = self.base + parent.areacode[0:2] + '/' + parent.href
    return self.initAreainfo(url, 'towntr', parent.areacode, '4')

def getVillagetr(self, parent):
    '''Level 5: villages; None when the town row carries no link.'''
    if parent.href == '':
        return None
    # Village pages nest one level deeper: <province>/<city>/<page>.
    url = (self.base + parent.areacode[0:2] + '/' +
           parent.areacode[2:4] + '/' + parent.href)
    return self.initAreainfo(url, 'villagetr', parent.areacode, '5')
2.4 省份数据封装
获取一个省下面所有地址数据
def finAllPronvinceCity(self, pro, dir):
    '''Walk city -> county -> town -> village under province `pro` and
    write the flattened result to <dir>/<province name>.xlsx.

    pro: the province areainfo; dir: output directory (created if missing).
    '''
    listall = [pro]
    citylist = self.getCity(pro)
    if citylist is None:
        citylist = []  # fix: original crashed when the city request failed
    for city in citylist:
        listall.append(city)
        conlist = self.getCounty(city)
        if conlist is not None:
            for county in conlist:
                listall.append(county)
                townlist = self.getTown(county)
                if townlist is not None:
                    for town in townlist:
                        listall.append(town)
                        villagelist = self.getVillagetr(town)
                        if villagelist is not None:
                            listall.extend(villagelist)
    df = pd.DataFrame([x.as_dict() for x in listall])
    if not os.path.exists(dir):
        os.makedirs(dir)
    filepath = os.path.join(dir, pro.areaname + '.xlsx')
    # Context manager replaces writer.save(), which pandas 2.0 removed.
    with pd.ExcelWriter(filepath) as writer:
        df.to_excel(writer, float_format='%.5f')

# 2.5 Thread wrapper
def ruanthread(self):
    '''Asynchronous entry: one crawler thread per province.'''
    provinces = self.getPronvice()
    if provinces is None:
        return  # fix: original crashed iterating None when the index page failed
    for province in provinces:
        threading.Thread(target=self.finAllPronvinceCity,
                         args=(province, 'F://areainfo')).start()

# 2.6 The almighty MAIN
if __name__ == '__main__':
    crawler = china_city()  # fix: original rebound the class name itself
    crawler.ruanthread()
2.7 补充:err.log 数据处理
构建新的方法,仅解析区划信息。该方法不太完善,仅供参考
def getCityOnly(self, url, classname, leve):
    '''Parse one page by row class only (no parent code is available).

    NOTE(review): the second parameter was named ``str`` in the original,
    shadowing the builtin; all visible call sites pass it positionally.
    '''
    return self.initAreainfo(url, classname, '', leve)

# Reprocess the URLs that getUrl recorded in err.log.
def errFileRe(self):
    '''Re-crawl every non-blank URL in err.log; return the recovered records.'''
    listother = []
    with open('err.log', 'r') as file:
        for line in file:
            if line.isspace():
                continue  # skip blank lines
            url = line.replace('\n', '')
            records = self.getCityOnly(url, 'villagetr', '5')
            if records is not None:  # fix: a failed request returns None;
                listother.extend(records)  # original crashed on extend(None)
    return listother
图片
导出数据列表
图片
数据格式图片
err.log日记:图片
完整代码:附上完整代码
import requests
from bs4 import BeautifulSoup
import pandas as pd
import threading
import os


class areainfo():
    '''Plain data holder for one administrative-division record.'''

    def __init__(self):
        self.areacode = ''    # administrative division code
        self.areaname = ''    # administrative division name
        self.parentcode = ''  # parent division code
        self.leve = ''        # division level, '1' (province) .. '5' (village)
        self.href = ''        # relative link to the child page

    def as_dict(self):
        '''Dict form of the record, used for the pandas export.'''
        return {'areacode': self.areacode, 'areaname': self.areaname,
                'parentcode': self.parentcode, 'leve': self.leve,
                'href': self.href}


class china_city():
    '''Crawler for the NBS 2020 administrative-division code pages.'''

    def __init__(self):
        # Index page of the 2020 division-code dataset.
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'

    def getUrl(self, url):
        '''Fetch one page and return its BeautifulSoup, or None on failure.

        Every failed URL is appended to err.log so the lost data can be
        re-crawled afterwards via errFileRe.
        '''
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
            # timeout added: without it a dead server blocks a thread forever
            resp = requests.get(url, headers=headers, timeout=30)
            resp.encoding = 'gbk'  # the site serves GBK-encoded pages
            return BeautifulSoup(resp.text, 'html.parser')
        except Exception as e:
            print(e)
            with open('err.log', 'a') as file:  # 'a': keep earlier failures
                file.write(url + '\n')
            return None

    def getPronvice(self):
        '''Level 1: provinces, parsed from the index page (None on failure).'''
        soup = self.getUrl(self.base)
        if soup is None:
            return None
        provinceList = []
        for row in soup.find_all(name='tr', attrs={'class': 'provincetr'}):
            for a in row.find_all('a'):
                province = areainfo()
                province.href = a['href']
                province.areaname = a.get_text()
                # Page name ('11.html') padded to the 6-digit code ('110000').
                province.areacode = a['href'].replace('.html', '0000')
                province.parentcode = '0'
                province.leve = '1'
                print(province.__dict__)
                provinceList.append(province)
        return provinceList

    def getCity(self, parent):
        '''Level 2: cities under a province.'''
        return self.initAreainfo(self.base + parent.href, 'citytr',
                                 parent.areacode, '2')

    def getCounty(self, parent):
        '''Level 3: counties under a city.'''
        return self.initAreainfo(self.base + parent.href, 'countytr',
                                 parent.areacode, '3')

    def getTown(self, parent):
        '''Level 4: towns; None when the county row carries no link.'''
        if parent.href == '':
            return None
        url = self.base + parent.areacode[0:2] + '/' + parent.href
        return self.initAreainfo(url, 'towntr', parent.areacode, '4')

    def getVillagetr(self, parent):
        '''Level 5: villages; None when the town row carries no link.'''
        if parent.href == '':
            return None
        url = (self.base + parent.areacode[0:2] + '/' +
               parent.areacode[2:4] + '/' + parent.href)
        return self.initAreainfo(url, 'villagetr', parent.areacode, '5')

    def initAreainfo(self, url, classname, parnetcode, leve):
        '''Parse one listing page into a list of areainfo (None on failure).

        classname: <tr> CSS class marking this level's rows; parnetcode:
        parent division code; leve: level of the rows being parsed.
        '''
        print('页面便签 %s -- 地址等第 %s --- url %s \n' % (classname, leve, url))
        soup = self.getUrl(url)
        if soup is None:
            return None
        records = []
        for tr in soup.find_all(name='tr', attrs={'class': classname}):
            group = tr.find_all('a')
            entity = areainfo()
            entity.leve = leve
            entity.parentcode = parnetcode
            if len(group) > 0:
                entity.href = group[0]['href']
                entity.areacode = group[0].string
                entity.areaname = group[1].string
            else:
                # Leaf rows have no links; read the <td> cells directly.
                entity.href = ''
                tds = tr.find_all('td')
                if len(tds) == 2:
                    entity.areacode = tds[0].string
                    entity.areaname = tds[1].string
                if len(tds) == 3:
                    # Village rows: the middle cell is a category code.
                    entity.areacode = tds[0].string
                    entity.areaname = tds[2].string
            records.append(entity)
        return records

    def finAllPronvinceCity(self, pro, dir):
        '''Collect every division under one province, export <dir>/<name>.xlsx.'''
        listall = [pro]
        citylist = self.getCity(pro)
        if citylist is None:
            citylist = []  # fix: original crashed when the city request failed
        for city in citylist:
            listall.append(city)
            conlist = self.getCounty(city)
            if conlist is not None:
                for county in conlist:
                    listall.append(county)
                    townlist = self.getTown(county)
                    if townlist is not None:
                        for town in townlist:
                            listall.append(town)
                            villagelist = self.getVillagetr(town)
                            if villagelist is not None:
                                listall.extend(villagelist)
        df = pd.DataFrame([x.as_dict() for x in listall])
        if not os.path.exists(dir):
            os.makedirs(dir)
        filepath = os.path.join(dir, pro.areaname + '.xlsx')
        # Context manager replaces writer.save(), which pandas 2.0 removed.
        with pd.ExcelWriter(filepath) as writer:
            df.to_excel(writer, float_format='%.5f')

    def ruanthread(self):
        '''Asynchronous entry: one crawler thread per province.'''
        provinces = self.getPronvice()
        if provinces is None:
            return  # fix: original crashed iterating None on index failure
        for province in provinces:
            threading.Thread(target=self.finAllPronvinceCity,
                             args=(province, 'F://areainfo')).start()


if __name__ == '__main__':
    crawler = china_city()  # fix: original rebound the class name itself
    crawler.ruanthread()