用户工具

站点工具


codes:tianyancha_crawler

爬取天眼查供应商信息

import requests from bs4 import BeautifulSoup import bs4 import traceback import numpy as np import pandas as pd import time import math

# Generic request wrapper used by every scraping function below.
def getHTMLText(url, kv):
    """Fetch *url* with request headers *kv* and return the decoded page text.

    On any request failure returns the literal sentinel string "产生异常"
    instead of raising, so callers (which wrap their parsing in try/except)
    keep working unchanged.
    """
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()     # raise HTTPError for non-200 responses
        r.encoding = r.apparent_encoding  # let requests guess the real charset
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so programming errors (bad argument
        # types, typos) are no longer silently swallowed as "network errors".
        return "产生异常"

# Part 1: resolve each company name to its tianyancha profile URL.
def getfirmID(fullName):
    """Search tianyancha for every company name and collect profile links.

    fullName : list of full company names (downloaded from the RESSET DB).

    Returns a list parallel to *fullName*: the href of the second <a> on
    each search-result page, or the one-element sentinel ['error'] when the
    lookup fails for any reason.
    """
    IDhrefList = []
    searchBase = 'https://www.tianyancha.com/search?key='  # hoisted: loop-invariant
    for fname in fullName:
        try:
            # NOTE(review): `kv` is read from module scope here; it is only
            # assigned inside main() in this file — confirm it is defined
            # before getfirmID() runs, otherwise every row becomes ['error'].
            demo = getHTMLText(searchBase + fname, kv)
            soup = BeautifulSoup(demo, "html.parser")
            # The second anchor on the result page points at the company profile.
            anchor = soup.find_all('a')[1]
            IDhrefList.append(anchor.attrs['href'])
        except Exception:
            # Narrowed from a bare `except:`. Keep the original one-element
            # list sentinel so downstream length checks behave identically.
            IDhrefList.append(['error'])
            continue
    return IDhrefList

df = pd.read_excel(r'D:\Astock_name.xls',converters={'A_StkCd':str}) fullName = df['LComNm'].tolist()

# Scrape each profile page for stock identity and supplier/client counts.
def getCompanyInfo(IDhrefList):
    """Visit every company profile URL and extract identity plus counts.

    IDhrefList : list of tianyancha profile URLs.

    Returns two lists parallel to *IDhrefList*:
      stockList     -- [short name, bond/stock code] per company
      relativeCount -- [supplier count, client count] per company
    A failed page yields the sentinel ['error'] in both lists, keeping them
    aligned by index.
    """
    stockList = []
    relativeCount = []
    for companyURL in IDhrefList:
        try:
            # NOTE(review): `kv` is read from module scope — it must be a
            # global by the time this runs (see main()).
            demo = getHTMLText(companyURL, kv)
            soup = BeautifulSoup(demo, "html.parser")
            # Short name and bond (stock) code shown in the page header.
            shortName = soup.find_all('span', attrs={'class': 'short-name'})
            stkcd = soup.find_all('span', attrs={'class': 'bond-nam'})
            # Supplier / client totals from the nav badges; 0 when absent.
            # Single find() each instead of the original find()+find_all() pair.
            sdiv = soup.find('div', attrs={'id': 'nav-main-suppliesCount'})
            cdiv = soup.find('div', attrs={'id': 'nav-main-clientsCount'})
            scount = sdiv.find_all('span')[1].string if sdiv is not None else 0
            ccount = cdiv.find_all('span')[1].string if cdiv is not None else 0

            stockList.append([shortName[0].string, stkcd[0].string])
            relativeCount.append([scount, ccount])
        except Exception:
            # Narrowed from a bare `except:`; append to BOTH lists so they
            # stay index-aligned.
            stockList.append(['error'])
            relativeCount.append(['error'])
            continue
    return stockList, relativeCount

# Scrape the paginated supplier table for one firm.
def getSuppliesInfo(supplilesCount, firmID, kv):
    """Fetch every page of the supplier list for company *firmID*.

    supplilesCount : total number of suppliers (int or numeric string); pages
                     of 10 rows are derived from it. (Parameter name kept
                     as-is — [sic] — for interface compatibility.)
    firmID : tianyancha numeric company id used by the pagination endpoint.
    kv : request headers (User-Agent / Cookie).

    Returns a list of [report period, supplier name, purchase share, source]
    rows; a page that fails for any reason contributes one ['error'] row.
    """
    suppliesList = []
    url = "https://www.tianyancha.com/pagination/supplies.xhtml?"  # hoisted: loop-invariant
    for page in range(math.ceil(int(supplilesCount) / 10)):
        # time.sleep(2)  # optional politeness delay
        data = {'ps': '10', 'pn': 1 + page, 'id': firmID, '_': ''}
        try:
            s = requests.session()
            r = s.get(url, params=data, headers=kv)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, "html.parser")
            # Supplier name lives in the <img alt>; other cells are addressed
            # from the end of the row because leading columns vary.
            for tr in soup.find("tbody").children:
                if isinstance(tr, bs4.element.Tag):
                    tds = tr("td")
                    suppliesList.append([tds[-3].string, tr.find('img').attrs['alt'],
                                         tds[-5].string, tds[-2].string])
        except Exception:
            # Narrowed from a bare `except:`; one sentinel row per failed page.
            suppliesList.append(['error'])
            continue
    return suppliesList

# Scrape the paginated client table for one firm (mirror of getSuppliesInfo).
def getClientsInfo(clientsCount, firmID, kv):
    """Fetch every page of the client list for company *firmID*.

    clientsCount : total number of clients (int or numeric string); pages of
                   10 rows are derived from it.
    firmID : tianyancha numeric company id used by the pagination endpoint.
    kv : request headers (User-Agent / Cookie).

    Returns a list of [report period, client name, sales share, source]
    rows; a page that fails for any reason contributes one ['error'] row.
    """
    clientsList = []
    url = "https://www.tianyancha.com/pagination/clients.xhtml?"  # hoisted: loop-invariant
    for page in range(math.ceil(int(clientsCount) / 10)):
        data = {'ps': '10', 'pn': 1 + page, 'id': firmID, '_': ''}
        try:
            s = requests.session()
            r = s.get(url, params=data, headers=kv)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, "html.parser")
            # Client name lives in the <img alt>; other cells are addressed
            # from the end of the row because leading columns vary.
            for tr in soup.find("tbody").children:
                if isinstance(tr, bs4.element.Tag):
                    tds = tr("td")
                    clientsList.append([tds[-3].string, tr.find('img').attrs['alt'],
                                        tds[-5].string, tds[-2].string])
        except Exception:
            # Narrowed from a bare `except:`; one sentinel row per failed page.
            clientsList.append(['error'])
            continue
    return clientsList

# Persist one firm's relations as an Excel workbook named after its stock code.
def outputInfo(stock, suppliesList, clientsList):
    """Write [name, code] plus the supplier and client tables side by side.

    stock : [short name, stock code] pair for the firm.
    suppliesList / clientsList : row lists produced by the scraping functions.
    """
    stock = np.array(stock).reshape(1, 2)
    frames = [
        pd.DataFrame(stock, columns=['企业名称', 'A股代码']),
        pd.DataFrame(suppliesList, columns=['报告期', '供应商', '采购占比', '数据来源']),
        pd.DataFrame(clientsList, columns=['报告期', '客户', '销售占比', '数据来源']),
    ]
    # Plain column-wise concat for now; rows are not aligned by report
    # period yet — needs rework later.
    combined = pd.concat(frames, axis=1, join='outer')
    combined.to_excel('D:\\supplychain\\' + stock[0][1] + '.xls', index=False)

def main():
    """Drive the crawl: load firm URLs/ids, scrape counts, then relations."""
    # BUG FIX: `kv` must be module-global. getCompanyInfo() and getfirmID()
    # read `kv` from module scope rather than taking it as a parameter; with
    # a plain local assignment here they died with NameError, which their
    # bare excepts silently converted into all-'error' output.
    global kv
    kv = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36' ,
        'Cookie':'aliyungf_tc=AQAAAJIUFxrQegEAO8rUdrLUnz2Nd11s; csrfToken=V67xMxoZlIGNXtXKevDrXdQX; jsid=SEO-BAIDU-ALL-SY-000001; TYCID=e75a3c40465011eaa050b3c52801358f; undefined=e75a3c40465011eaa050b3c52801358f; ssuid=3863879260; bannerFlag=undefined; _ga=GA1.2.1263231458.1580712424; _gid=GA1.2.1540477573.1580712424; _gat_gtag_UA_123487620_1=1; tyc-user-info=%257B%2522claimEditPoint%2522%253A%25220%2522%252C%2522myAnswerCount%2522%253A%25220%2522%252C%2522myQuestionCount%2522%253A%25220%2522%252C%2522signUp%2522%253A%25220%2522%252C%2522explainPoint%2522%253A%25220%2522%252C%2522privateMessagePointWeb%2522%253A%25220%2522%252C%2522nickname%2522%253A%2522%25E5%25AE%258C%25E9%25A2%259C%25E9%2598%25BF%25E9%25AA%25A8%25E6%2589%2593%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522privateMessagePoint%2522%253A%25220%2522%252C%2522state%2522%253A%25220%2522%252C%2522announcementPoint%2522%253A%25220%2522%252C%2522isClaim%2522%253A%25220%2522%252C%2522bidSubscribe%2522%253A%2522-1%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522discussCommendCount%2522%253A%25220%2522%252C%2522monitorUnreadCount%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522claimPoint%2522%253A%25220%2522%252C%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzM1NDMwMDk3NiIsImlhdCI6MTU4MDcxMjQzMSwiZXhwIjoxNjEyMjQ4NDMxfQ.B_pZhhv3tH6P0XZU2oG8Xjsmd1y45t0OCxLHVCJqfUjvnNlE-XTN9xLLJwFref8oMNWRvSqtbWmnhh57bT1aLg%2522%252C%2522pleaseAnswerCount%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522bizCardUnread%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252217354300976%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzM1NDMwMDk3NiIsImlhdCI6MTU4MDcxMjQzMSwiZXhwIjoxNjEyMjQ4NDMxfQ.B_pZhhv3tH6P0XZU2oG8Xjsmd1y45t0OCxLHVCJqfUjvnNlE-XTN9xLLJwFref8oMNWRvSqtbWmnhh57bT1aLg; tyc-user-phone=%255B%252217354300976%2522%255D; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1580712434; RTYCID=e69cc442f73340f581eacc4cef5ceb7d; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1580712469'
        }
    # Pre-resolved profile URLs and numeric firm ids (produced in Part 1).
    df = pd.read_excel(r'D:\Astock_name_url_ID.xls', converters={'A_StkCd': str})
    IDhrefList = df['url'].tolist()
    firmID = df['firmID'].tolist()
    stockList, relativeCount = getCompanyInfo(IDhrefList)
    for i in range(len(firmID)):
        # A one-element entry is the ['error'] sentinel from getCompanyInfo;
        # skip those firms entirely.
        if len(relativeCount[i]) != 1:
            suppliesList = getSuppliesInfo(relativeCount[i][0], firmID[i], kv)
            clientsList = getClientsInfo(relativeCount[i][1], firmID[i], kv)
            outputInfo(stockList[i], suppliesList, clientsList)

# 运行程序 main()

codes/tianyancha_crawler.txt · 最后更改: 2023/11/10 12:13 由 127.0.0.1

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki