# codes: tianyancha_crawler
# 爬取天眼查供应商信息 — crawl supplier/customer information from Tianyancha
import math
import os
import time
import traceback

import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Generic request framework.
def getHTMLText(url, kv):
    """Fetch *url* and return the decoded page text.

    Parameters
    ----------
    url : str
        Target URL.
    kv : dict
        HTTP headers (User-Agent / Cookie) forwarded to ``requests.get``.

    Returns
    -------
    str
        The response body on success, or the literal sentinel string
        "产生异常" ("an exception occurred") on any request failure.
        Callers in this file feed the return value straight into
        BeautifulSoup, so the sentinel is kept for backward compatibility.
    """
    try:
        r = requests.get(url, headers=kv, timeout=30)
        # Raise HTTPError for any non-2xx status code.
        r.raise_for_status()
        # Use the detected encoding so Chinese pages decode correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except`, which also swallowed NameError,
        # KeyboardInterrupt, etc. and masked real bugs.
        return "产生异常"
# --- Part 1: collect each company's Tianyancha detail-page URL (saved to Excel) ---

# Get the company URL link id.
def getfirmID(fullName, kv=None):
    """Return the first search-result link on Tianyancha for each company name.

    Parameters
    ----------
    fullName : iterable of str
        Full company names (downloaded from the RESSET database).
    kv : dict, optional
        HTTP headers. Backward-compatible addition: the original read a
        global ``kv`` implicitly, which raised NameError (silently eaten
        by a bare ``except``) whenever no global was defined. Defaults to
        the module-level ``kv`` if one exists, else empty headers.

    Returns
    -------
    list
        One href string per company, or the sentinel ``['error']`` for a
        failed lookup (kept for compatibility with downstream code).
    """
    if kv is None:
        # Fall back to a module-level header dict, as the original implicitly did.
        kv = globals().get('kv', {})
    IDhrefList = []
    searchBase = 'https://www.tianyancha.com/search?key='
    for fname in fullName:
        try:
            demo = getHTMLText(searchBase + fname, kv)
            soup = BeautifulSoup(demo, "html.parser")
            # The second <a> on the results page is the first company hit
            # — assumes the 2020-era page layout; TODO confirm still valid.
            a = soup.find_all('a')[1]
            IDhrefList.append(a.attrs['href'])
        except (AttributeError, IndexError, KeyError, TypeError):
            # Layout change or no results: record sentinel and move on.
            IDhrefList.append(['error'])
    return IDhrefList
df = pd.read_excel(r'D:\Astock_name.xls',converters={'A_StkCd':str}) fullName = df['LComNm'].tolist()
# Get each company's short name / stock code and its supplier & client counts.
def getCompanyInfo(IDhrefList, kv=None):
    """Scrape identity and relation counts from each company detail page.

    Parameters
    ----------
    IDhrefList : iterable of str
        Company detail-page URLs.
    kv : dict, optional
        HTTP headers. Backward-compatible addition: the original relied on
        a global ``kv``, but ``main()`` defines ``kv`` locally, so lookups
        raised NameError and every row silently became ``['error']``.

    Returns
    -------
    (list, list)
        ``stockList``: ``[short name, stock code]`` per company;
        ``relativeCount``: ``[supplier count, client count]`` per company.
        Failures append the sentinel ``['error']`` to both lists.
    """
    if kv is None:
        kv = globals().get('kv', {})
    stockList = []
    relativeCount = []
    for companyURL in IDhrefList:
        try:
            demo = getHTMLText(companyURL, kv)
            soup = BeautifulSoup(demo, "html.parser")
            # Short name and bond/stock code shown in the page header.
            shortName = soup.find_all('span', attrs={'class': 'short-name'})
            stkcd = soup.find_all('span', attrs={'class': 'bond-nam'})
            # Supplier / client totals from the nav bar; 0 when a tab is
            # absent. Hoisted to single lookups — the original queried the
            # DOM twice per count (find_all for existence, then find).
            sdiv = soup.find('div', attrs={'id': 'nav-main-suppliesCount'})
            cdiv = soup.find('div', attrs={'id': 'nav-main-clientsCount'})
            scount = sdiv.find_all('span')[1].string if sdiv is not None else 0
            ccount = cdiv.find_all('span')[1].string if cdiv is not None else 0
            stockList.append([shortName[0].string, stkcd[0].string])
            relativeCount.append([scount, ccount])
        except (AttributeError, IndexError, KeyError, TypeError):
            # Keep both lists aligned even on failure.
            stockList.append(['error'])
            relativeCount.append(['error'])
    return stockList, relativeCount
# Get supplier information via the pagination endpoint.
def getSuppliesInfo(supplilesCount, firmID, kv):
    """Fetch all supplier rows for one firm, 10 records per page.

    Parameters
    ----------
    supplilesCount : int or str
        Total number of supplier records (parameter name typo kept so any
        keyword caller remains compatible).
    firmID : str or int
        Tianyancha internal firm id passed as the ``id`` query parameter.
    kv : dict
        HTTP headers (User-Agent / Cookie).

    Returns
    -------
    list of list
        ``[report period, supplier name, purchase share, data source]``
        per row; a page-level failure appends the sentinel ``['error']``.
    """
    suppliesList = []
    url = "https://www.tianyancha.com/pagination/supplies.xhtml?"
    # Hoisted: one session reused across pages (the original built a new
    # session on every iteration, discarding keep-alive connections).
    session = requests.session()
    for page in range(math.ceil(int(supplilesCount) / 10)):
        data = {'ps': '10', 'pn': 1 + page, 'id': firmID, '_': ''}
        try:
            r = session.get(url, params=data, headers=kv)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, "html.parser")
            tbody = soup.find("tbody")
            if tbody is None:
                # Layout change or empty/blocked page.
                suppliesList.append(['error'])
                continue
            for tr in tbody.children:
                if isinstance(tr, bs4.element.Tag):
                    tds = tr("td")
                    # Column order matches outputInfo's headers:
                    # 报告期, 供应商 (img alt), 采购占比, 数据来源.
                    suppliesList.append([tds[-3].string,
                                         tr.find('img').attrs['alt'],
                                         tds[-5].string,
                                         tds[-2].string])
        except (requests.RequestException, AttributeError, IndexError, KeyError):
            suppliesList.append(['error'])
    return suppliesList
# Get client information via the pagination endpoint.
def getClientsInfo(clientsCount, firmID, kv):
    """Fetch all client rows for one firm, 10 records per page.

    Parameters
    ----------
    clientsCount : int or str
        Total number of client records.
    firmID : str or int
        Tianyancha internal firm id passed as the ``id`` query parameter.
    kv : dict
        HTTP headers (User-Agent / Cookie).

    Returns
    -------
    list of list
        ``[report period, client name, sales share, data source]`` per
        row; a page-level failure appends the sentinel ``['error']``.
    """
    clientsList = []
    url = "https://www.tianyancha.com/pagination/clients.xhtml?"
    # Hoisted: one session reused across pages (mirrors getSuppliesInfo).
    session = requests.session()
    for page in range(math.ceil(int(clientsCount) / 10)):
        data = {'ps': '10', 'pn': 1 + page, 'id': firmID, '_': ''}
        try:
            r = session.get(url, params=data, headers=kv)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, "html.parser")
            tbody = soup.find("tbody")
            if tbody is None:
                # Layout change or empty/blocked page.
                clientsList.append(['error'])
                continue
            for tr in tbody.children:
                if isinstance(tr, bs4.element.Tag):
                    tds = tr("td")
                    # Column order matches outputInfo's headers:
                    # 报告期, 客户 (img alt), 销售占比, 数据来源.
                    clientsList.append([tds[-3].string,
                                        tr.find('img').attrs['alt'],
                                        tds[-5].string,
                                        tds[-2].string])
        except (requests.RequestException, AttributeError, IndexError, KeyError):
            clientsList.append(['error'])
    return clientsList
# Output one firm's relations to an Excel spreadsheet.
def outputInfo(stock, suppliesList, clientsList):
    """Write a firm's supplier and client relations to ``D:\\supplychain``.

    Parameters
    ----------
    stock : sequence
        Exactly two elements: ``[company name, A-share code]``; the
        reshape below fails fast otherwise (e.g. the ``['error']`` sentinel).
    suppliesList, clientsList : list of list
        Rows produced by getSuppliesInfo / getClientsInfo.

    Side effect: writes ``D:\\supplychain\\<code>.xls``.
    """
    stock = np.array(stock).reshape(1, 2)
    firm = pd.DataFrame(stock, columns=['企业名称', 'A股代码'])
    supplies = pd.DataFrame(suppliesList, columns=['报告期', '供应商', '采购占比', '数据来源'])
    clients = pd.DataFrame(clientsList, columns=['报告期', '客户', '销售占比', '数据来源'])
    # Positional side-by-side concat — rows are NOT aligned by report
    # period. TODO: merge on 报告期 instead (original comment: 后续得改动).
    firmRelations = pd.concat([firm, supplies, clients], axis=1, join='outer')
    outDir = 'D:\\supplychain\\'
    # Robustness fix: to_excel raised if the directory did not exist.
    os.makedirs(outDir, exist_ok=True)
    firmRelations.to_excel(outDir + stock[0][1] + '.xls', index=False)
def main():
    """Driver: read company URLs/ids from Excel, scrape each firm's
    supplier and client relations, and write one Excel file per firm."""
    # Request headers. NOTE(review): the Cookie is a captured login
    # session with an embedded auth token — it has long since expired
    # and must be replaced with a fresh one before running; it should
    # not be committed to source at all.
    kv = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'Cookie': 'aliyungf_tc=AQAAAJIUFxrQegEAO8rUdrLUnz2Nd11s; csrfToken=V67xMxoZlIGNXtXKevDrXdQX; jsid=SEO-BAIDU-ALL-SY-000001; TYCID=e75a3c40465011eaa050b3c52801358f; undefined=e75a3c40465011eaa050b3c52801358f; ssuid=3863879260; bannerFlag=undefined; _ga=GA1.2.1263231458.1580712424; _gid=GA1.2.1540477573.1580712424; _gat_gtag_UA_123487620_1=1; tyc-user-info=%257B%2522claimEditPoint%2522%253A%25220%2522%252C%2522myAnswerCount%2522%253A%25220%2522%252C%2522myQuestionCount%2522%253A%25220%2522%252C%2522signUp%2522%253A%25220%2522%252C%2522explainPoint%2522%253A%25220%2522%252C%2522privateMessagePointWeb%2522%253A%25220%2522%252C%2522nickname%2522%253A%2522%25E5%25AE%258C%25E9%25A2%259C%25E9%2598%25BF%25E9%25AA%25A8%25E6%2589%2593%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522privateMessagePoint%2522%253A%25220%2522%252C%2522state%2522%253A%25220%2522%252C%2522announcementPoint%2522%253A%25220%2522%252C%2522isClaim%2522%253A%25220%2522%252C%2522bidSubscribe%2522%253A%2522-1%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522discussCommendCount%2522%253A%25220%2522%252C%2522monitorUnreadCount%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522claimPoint%2522%253A%25220%2522%252C%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzM1NDMwMDk3NiIsImlhdCI6MTU4MDcxMjQzMSwiZXhwIjoxNjEyMjQ4NDMxfQ.B_pZhhv3tH6P0XZU2oG8Xjsmd1y45t0OCxLHVCJqfUjvnNlE-XTN9xLLJwFref8oMNWRvSqtbWmnhh57bT1aLg%2522%252C%2522pleaseAnswerCount%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522bizCardUnread%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252217354300976%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzM1NDMwMDk3NiIsImlhdCI6MTU4MDcxMjQzMSwiZXhwIjoxNjEyMjQ4NDMxfQ.B_pZhhv3tH6P0XZU2oG8Xjsmd1y45t0OCxLHVCJqfUjvnNlE-XTN9xLLJwFref8oMNWRvSqtbWmnhh57bT1aLg; tyc-user-phone=%255B%252217354300976%2522%255D; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1580712434; RTYCID=e69cc442f73340f581eacc4cef5ceb7d; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1580712469'
    }
    # Input sheet: one row per company with its detail-page URL and firm id.
    df = pd.read_excel(r'D:\Astock_name_url_ID.xls', converters={'A_StkCd': str})
    IDhrefList = df['url'].tolist()
    firmID = df['firmID'].tolist()
    # NOTE(review): kv is NOT passed here — getCompanyInfo as written reads
    # a global `kv`, which this local dict does not provide, so requests go
    # out without headers (or fail). Confirm and pass kv explicitly.
    stockList, relativeCount = getCompanyInfo(IDhrefList)
    for i in range(len(firmID)):
        # Skip companies whose scrape failed (sentinel ['error'] has len 1).
        if len(relativeCount[i]) != 1:
            suppliesList = getSuppliesInfo(relativeCount[i][0], firmID[i], kv)
            clientsList = getClientsInfo(relativeCount[i][1], firmID[i], kv)
            outputInfo(stockList[i], suppliesList, clientsList)
# 运行程序 main()
# codes/tianyancha_crawler.txt · Last modified: 2023/11/10 12:13 by 127.0.0.1