使用BeautifulSoup爬取世界政府债券网站收益率表格时返回空表的问题求助
使用BeautifulSoup爬取世界政府债券网站收益率表格时返回空表的问题求助
我正在尝试从一个网站爬取多个国家不同期限的收益率表格,但目前只得到空表:

而预期的表格应该是这样的:

我目前的实现代码如下:
import time
import datetime as dt
import pandas as pd
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
import requests
import re
import os

path = os.getcwd()


def ZCCWord(Date, country):
    """Scrape one country's yield table and return it as a DataFrame.

    Fetches ``http://www.worldgovernmentbonds.com/country/<country>``, takes
    the first ``<table>`` of the returned HTML, skips its first ``<tr>`` and
    converts the remaining rows into tenor / period / maturity date / rate
    columns.

    Args:
        Date: Base date; each tenor is added to it with ``relativedelta`` to
            produce the DATES column.
        country: Last path segment of the country page URL (e.g. ``"spain"``).

    Returns:
        DataFrame with columns ``TENOR`` (int), ``PERIOD`` (the alphabetic
        unit that follows the number), ``DATES`` (``Date`` shifted by the
        tenor) and ``RATES`` (the scraped number divided by 100). Rows whose
        tenor is not a plain integer are dropped; a row without a parsable
        rate keeps ``NaN`` in ``RATES``.

    Raises:
        IndexError: If the downloaded HTML contains no ``<table>`` element.
        KeyError: If no parsed row has at least three ``<td>`` cells.
    """
    # Site URL
    url = "http://www.worldgovernmentbonds.com/country/" + country
    # A timeout keeps one unresponsive request from blocking the whole run.
    html_content = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html_content, "lxml")

    # NOTE(review): only the static HTML is inspected. If the site fills the
    # yield table client-side (JavaScript), the rows will not be present
    # here and the result will be empty - TODO confirm against the live page.
    first_table = soup.find_all("table")[0]
    body_rows = first_table.find_all("tr")[1:]  # first <tr> is skipped

    # One list of cleaned cell texts per table row.
    all_rows = [
        [re.sub("\xa0|\n|,", "", cell.text) for cell in tr.find_all("td")]
        for tr in body_rows
    ]
    raw = pd.DataFrame(all_rows)

    # NOTE(review): columns 1 and 2 are assumed to hold the maturity label
    # (e.g. "3 months") and the yield - verify against the page layout.
    parsed = raw[1].str.extract(r'([^a-zA-Z]+)([a-zA-Z]+)', expand=True)
    parsed.columns = ['TENOR', 'PERIOD']
    parsed['TENOR'] = parsed['TENOR'].str.strip()
    # Extract the rate on the same row index as the tenor so the two can
    # never drift apart when some rows fail to parse. The optional "-" keeps
    # the sign of negative values.
    parsed['RATES'] = raw[2].str.extract(r'(-?[0-9.]+)', expand=False)

    # Keep only rows whose tenor is a plain integer. The previous code
    # assigned the isdigit() mask itself to TENOR, turning every tenor
    # into 0/1 after the int cast.
    parsed = parsed.dropna(subset=['TENOR', 'PERIOD'])
    ZCC = parsed[parsed['TENOR'].str.isdigit()].reset_index(drop=True)
    ZCC['TENOR'] = ZCC['TENOR'].astype(int)
    ZCC['RATES'] = ZCC['RATES'].astype(float) / 100

    # Month tenors shift the base date by months; every other unit is
    # treated as years (same rule as before).
    ZCC['DATES'] = [
        Date + (
            relativedelta(months=int(tenor))
            if period in ('month', 'months')
            else relativedelta(years=int(tenor))
        )
        for tenor, period in zip(ZCC['TENOR'], ZCC['PERIOD'])
    ]
    ZCC = ZCC.reindex(['TENOR', 'PERIOD', 'DATES', 'RATES'], axis=1)
    return ZCC


LitsCountries = ['spain', 'portugal', 'latvia', 'ireland', 'united-kingdom', 'germany',
                 'france', 'italy', 'sweden', 'finland', 'greece', 'poland', 'romania',
                 'hungary', 'netherlands']

# Workbook is written to the current working directory; os.path.join picks
# the right separator instead of a hard-coded Windows backslash.
todays_date = os.path.join(
    path,
    'WorldYields' + str(dt.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.xlsx',
)

dictYield = {}
# The context manager closes (and thereby saves) the workbook even if one
# of the countries raises part-way through.
with pd.ExcelWriter(todays_date, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    for i, country in enumerate(LitsCountries):
        Date = pd.to_datetime('today').date()
        ZCC = ZCCWord(Date, country)
        dictYield[i] = ZCC  # keyed by position in LitsCountries
        ZCC.to_excel(writer, sheet_name=country)  # one sheet per country

time.sleep(60)  # wait one minute
我也可以接受其他能提供类似输出的网站、解决方案或方法,请问有没有解决思路?
提前感谢!
备注:内容来源于 Stack Exchange,提问作者 Luca91




