Python 爬虫:中国行政区划信息爬取
发布日期:2024-08-03 22:41 点击次数:154
绪论
业务部门需要更新最新的全国区划信息数据,建立基础数据库,权威数据当然是国家统计局的官方数据
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/
图片
这里要做的,就是将其爬取下来。环境准备:我们使用 python 工具爬取数据,并将其保存为 Excel:
python环境 ,略过;
相关依赖 requests、BeautifulSoup、pandas、threading、os;requests 用于 web 请求,并获取页面数据;BeautifulSoup 提取页面数据;pandas 数据分析,此处仅用来方便数据导出;threading 多线程爬取;
代码片段 1、定义地址信息对象,封装解析后的数据:areainfo
class areainfo():
    """Plain data holder for one administrative-division record.

    NOTE(review): the class name and the misspelled attribute ``leve`` are
    kept as-is because the rest of the article instantiates and reads them.
    """

    def __init__(self):
        self.areacode = ''    # administrative division code
        self.areaname = ''    # administrative division name
        self.parentcode = ''  # parent division code
        self.leve = ''        # division level, '1' (province) .. '5' (village)
        self.href = ''        # relative link to the child page

    def as_dict(self):
        """Return the record as a plain dict (used for the pandas export)."""
        return {'areacode': self.areacode, 'areaname': self.areaname,
                'parentcode': self.parentcode, 'leve': self.leve,
                'href': self.href}

    def __repr__(self):
        # Added for debuggability; purely additive, no caller depends on it.
        return 'areainfo(%r, %r, leve=%r)' % (self.areacode, self.areaname,
                                              self.leve)


# 2. The address-parsing class (article section header).
将所有地址解析方法封装为一个类,包含 web 请求、web 解析等方法
图片
2.1 获取 web 信息
# classname: the <tr> class tag on the page; parnetcode: parent division
# code; leve: level of the rows being parsed.
def initAreainfo(self, url, classname, parnetcode, leve):
    '''Parse one listing page into areainfo records (None when fetch fails).'''
    print('页面便签 %s -- 地址等第 %s --- url %s \n' % (classname, leve, url))
    soup = self.getUrl(url)
    if soup is None:
        return None
    # Rows of the current level are selected by their CSS class.
    rows = soup.find_all(name='tr', attrs={'class': classname})
    parsed = []
    for row in rows:
        anchors = row.find_all('a')
        record = areainfo()
        record.leve = leve
        record.parentcode = parnetcode
        if anchors:
            # Linked row: first <a> carries code + href, second the name.
            record.href = anchors[0]['href']
            record.areacode = anchors[0].string
            record.areaname = anchors[1].string
        else:
            # Leaf row without links: read the <td> cells directly.
            record.href = ''
            cells = row.find_all('td')
            if len(cells) == 2:
                record.areacode = cells[0].string
                record.areaname = cells[1].string
            if len(cells) == 3:
                # Three cells: the middle one is a category code, skip it.
                record.areacode = cells[0].string
                record.areaname = cells[2].string
        parsed.append(record)
    return parsed
该处将异常的请求存到 err.log 文件中,以便于后期读取异常链接,补充丢失数据。
# 2.2 Web page parsing (article section 2.2).
def initAreainfo(self, url, classname, parnetcode, leve):
    '''Turn one listing page into a list of areainfo records.

    classname: <tr> CSS class marking this level's rows; parnetcode: code
    of the parent division; leve: level of the rows.  Returns None when
    the HTTP request failed.
    '''
    print('页面便签 %s -- 地址等第 %s --- url %s \n' % (classname, leve, url))
    soup = self.getUrl(url)
    if soup is None:
        return None
    result = []
    for tr in soup.find_all(name='tr', attrs={'class': classname}):
        links = tr.find_all('a')
        info = areainfo()
        info.leve = leve
        info.parentcode = parnetcode
        if len(links) > 0:
            info.href = links[0]['href']
            info.areacode = links[0].string
            info.areaname = links[1].string
        else:
            info.href = ''
            tds = tr.find_all('td')
            if len(tds) == 2:
                info.areacode = tds[0].string
                info.areaname = tds[1].string
            if len(tds) == 3:
                # 3-cell rows (villages) carry a category code in the middle.
                info.areacode = tds[0].string
                info.areaname = tds[2].string
        result.append(info)
    return result
# In the web pages, each division level uses a different row tag; identify
# it with the browser's F12 devtools.  BeautifulSoup extracts the needed
# division rows by that tag's class attribute.
eg
图片
2.3 区划信息提取:各等级区划信息提取,分别调用 2.2 的方法进行解析。每个方法返回地址 list
def getPronvice(self):
    '''Level 1: provinces, parsed from the index page (None on failure).'''
    soup = self.getUrl(self.base)
    if soup is None:
        return None
    provinces = []
    for row in soup.find_all(name='tr', attrs={'class': 'provincetr'}):
        for anchor in row.find_all('a'):
            item = areainfo()
            item.href = anchor['href']
            item.areaname = anchor.get_text()
            # Page name ('11.html') padded to the 6-digit code ('110000').
            item.areacode = anchor['href'].replace('.html', '0000')
            item.parentcode = '0'
            item.leve = '1'
            print(item.__dict__)
            provinces.append(item)
    return provinces

def getCity(self, parent):
    '''Level 2: cities under the given province.'''
    return self.initAreainfo(self.base + parent.href, 'citytr',
                             parent.areacode, '2')

def getCounty(self, parent):
    '''Level 3: counties under the given city.'''
    return self.initAreainfo(self.base + parent.href, 'countytr',
                             parent.areacode, '3')

def getTown(self, parent):
    '''Level 4: towns; None when the county row carries no link.'''
    if parent.href == '':
        return None
    # Town pages live under a sub-directory named by the province prefix.
    url = self.base + parent.areacode[0:2] + '/' + parent.href
    return self.initAreainfo(url, 'towntr', parent.areacode, '4')

def getVillagetr(self, parent):
    '''Level 5: villages; None when the town row carries no link.'''
    if parent.href == '':
        return None
    # Village pages nest one level deeper: <province>/<city>/<page>.
    url = (self.base + parent.areacode[0:2] + '/' +
           parent.areacode[2:4] + '/' + parent.href)
    return self.initAreainfo(url, 'villagetr', parent.areacode, '5')
2.4 省份数据封装
获取一个省下面所有地址数据
def finAllPronvinceCity(self, pro, dir):
    '''Walk city -> county -> town -> village under province `pro` and
    write the flattened result to <dir>/<province name>.xlsx.

    pro: the province areainfo; dir: output directory (created if missing).
    '''
    listall = [pro]
    citylist = self.getCity(pro)
    if citylist is None:
        citylist = []  # fix: original crashed when the city request failed
    for city in citylist:
        listall.append(city)
        conlist = self.getCounty(city)
        if conlist is not None:
            for county in conlist:
                listall.append(county)
                townlist = self.getTown(county)
                if townlist is not None:
                    for town in townlist:
                        listall.append(town)
                        villagelist = self.getVillagetr(town)
                        if villagelist is not None:
                            listall.extend(villagelist)
    df = pd.DataFrame([x.as_dict() for x in listall])
    if not os.path.exists(dir):
        os.makedirs(dir)
    filepath = os.path.join(dir, pro.areaname + '.xlsx')
    # Context manager replaces writer.save(), which pandas 2.0 removed.
    with pd.ExcelWriter(filepath) as writer:
        df.to_excel(writer, float_format='%.5f')

# 2.5 Thread wrapper
def ruanthread(self):
    '''Asynchronous entry: one crawler thread per province.'''
    provinces = self.getPronvice()
    if provinces is None:
        return  # fix: original crashed iterating None when the index page failed
    for province in provinces:
        threading.Thread(target=self.finAllPronvinceCity,
                         args=(province, 'F://areainfo')).start()

# 2.6 The almighty MAIN
if __name__ == '__main__':
    crawler = china_city()  # fix: original rebound the class name itself
    crawler.ruanthread()
2.7 补充:err.log 数据处理
构建新的方法,仅解析区划信息。该方法不太完善,仅供参考
def getCityOnly(self, url, classname, leve):
    '''Parse one page by row class only (no parent code is available).

    NOTE(review): the second parameter was named ``str`` in the original,
    shadowing the builtin; all visible call sites pass it positionally.
    '''
    return self.initAreainfo(url, classname, '', leve)

# Reprocess the URLs that getUrl recorded in err.log.
def errFileRe(self):
    '''Re-crawl every non-blank URL in err.log; return the recovered records.'''
    listother = []
    with open('err.log', 'r') as file:
        for line in file:
            if line.isspace():
                continue  # skip blank lines
            url = line.replace('\n', '')
            records = self.getCityOnly(url, 'villagetr', '5')
            if records is not None:  # fix: a failed request returns None;
                listother.extend(records)  # original crashed on extend(None)
    return listother
图片
导出数据列表
图片
数据格式图片
err.log日记:图片
完整代码:附上完整代码
import requests
from bs4 import BeautifulSoup
import pandas as pd
import threading
import os


class areainfo():
    '''Plain data holder for one administrative-division record.'''

    def __init__(self):
        self.areacode = ''    # administrative division code
        self.areaname = ''    # administrative division name
        self.parentcode = ''  # parent division code
        self.leve = ''        # division level, '1' (province) .. '5' (village)
        self.href = ''        # relative link to the child page

    def as_dict(self):
        '''Dict form of the record, used for the pandas export.'''
        return {'areacode': self.areacode, 'areaname': self.areaname,
                'parentcode': self.parentcode, 'leve': self.leve,
                'href': self.href}


class china_city():
    '''Crawler for the NBS 2020 administrative-division code pages.'''

    def __init__(self):
        # Index page of the 2020 division-code dataset.
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'

    def getUrl(self, url):
        '''Fetch one page and return its BeautifulSoup, or None on failure.

        Every failed URL is appended to err.log so the lost data can be
        re-crawled afterwards via errFileRe.
        '''
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
            # timeout added: without it a dead server blocks a thread forever
            resp = requests.get(url, headers=headers, timeout=30)
            resp.encoding = 'gbk'  # the site serves GBK-encoded pages
            return BeautifulSoup(resp.text, 'html.parser')
        except Exception as e:
            print(e)
            with open('err.log', 'a') as file:  # 'a': keep earlier failures
                file.write(url + '\n')
            return None

    def getPronvice(self):
        '''Level 1: provinces, parsed from the index page (None on failure).'''
        soup = self.getUrl(self.base)
        if soup is None:
            return None
        provinceList = []
        for row in soup.find_all(name='tr', attrs={'class': 'provincetr'}):
            for a in row.find_all('a'):
                province = areainfo()
                province.href = a['href']
                province.areaname = a.get_text()
                # Page name ('11.html') padded to the 6-digit code ('110000').
                province.areacode = a['href'].replace('.html', '0000')
                province.parentcode = '0'
                province.leve = '1'
                print(province.__dict__)
                provinceList.append(province)
        return provinceList

    def getCity(self, parent):
        '''Level 2: cities under a province.'''
        return self.initAreainfo(self.base + parent.href, 'citytr',
                                 parent.areacode, '2')

    def getCounty(self, parent):
        '''Level 3: counties under a city.'''
        return self.initAreainfo(self.base + parent.href, 'countytr',
                                 parent.areacode, '3')

    def getTown(self, parent):
        '''Level 4: towns; None when the county row carries no link.'''
        if parent.href == '':
            return None
        url = self.base + parent.areacode[0:2] + '/' + parent.href
        return self.initAreainfo(url, 'towntr', parent.areacode, '4')

    def getVillagetr(self, parent):
        '''Level 5: villages; None when the town row carries no link.'''
        if parent.href == '':
            return None
        url = (self.base + parent.areacode[0:2] + '/' +
               parent.areacode[2:4] + '/' + parent.href)
        return self.initAreainfo(url, 'villagetr', parent.areacode, '5')

    def initAreainfo(self, url, classname, parnetcode, leve):
        '''Parse one listing page into a list of areainfo (None on failure).

        classname: <tr> CSS class marking this level's rows; parnetcode:
        parent division code; leve: level of the rows being parsed.
        '''
        print('页面便签 %s -- 地址等第 %s --- url %s \n' % (classname, leve, url))
        soup = self.getUrl(url)
        if soup is None:
            return None
        records = []
        for tr in soup.find_all(name='tr', attrs={'class': classname}):
            group = tr.find_all('a')
            entity = areainfo()
            entity.leve = leve
            entity.parentcode = parnetcode
            if len(group) > 0:
                entity.href = group[0]['href']
                entity.areacode = group[0].string
                entity.areaname = group[1].string
            else:
                # Leaf rows have no links; read the <td> cells directly.
                entity.href = ''
                tds = tr.find_all('td')
                if len(tds) == 2:
                    entity.areacode = tds[0].string
                    entity.areaname = tds[1].string
                if len(tds) == 3:
                    # Village rows: the middle cell is a category code.
                    entity.areacode = tds[0].string
                    entity.areaname = tds[2].string
            records.append(entity)
        return records

    def finAllPronvinceCity(self, pro, dir):
        '''Collect every division under one province, export <dir>/<name>.xlsx.'''
        listall = [pro]
        citylist = self.getCity(pro)
        if citylist is None:
            citylist = []  # fix: original crashed when the city request failed
        for city in citylist:
            listall.append(city)
            conlist = self.getCounty(city)
            if conlist is not None:
                for county in conlist:
                    listall.append(county)
                    townlist = self.getTown(county)
                    if townlist is not None:
                        for town in townlist:
                            listall.append(town)
                            villagelist = self.getVillagetr(town)
                            if villagelist is not None:
                                listall.extend(villagelist)
        df = pd.DataFrame([x.as_dict() for x in listall])
        if not os.path.exists(dir):
            os.makedirs(dir)
        filepath = os.path.join(dir, pro.areaname + '.xlsx')
        # Context manager replaces writer.save(), which pandas 2.0 removed.
        with pd.ExcelWriter(filepath) as writer:
            df.to_excel(writer, float_format='%.5f')

    def ruanthread(self):
        '''Asynchronous entry: one crawler thread per province.'''
        provinces = self.getPronvice()
        if provinces is None:
            return  # fix: original crashed iterating None on index failure
        for province in provinces:
            threading.Thread(target=self.finAllPronvinceCity,
                             args=(province, 'F://areainfo')).start()


if __name__ == '__main__':
    crawler = china_city()  # fix: original rebound the class name itself
    crawler.ruanthread()