本手册对应本周课堂的“Python 从 0 到策略与回测”实践,完整复刻上周 Excel/WPS 的步骤与口径,可与课堂 Notebook《FF5_Multifactor_Backtrader.ipynb》配套使用。你将从四张 CSMAR 原始 CSV 开始,完成数据清洗、财务对齐、横截面因子工程(winsor→秩正态→规模中性化→合成)、研究评估(Rank‑IC、五分位)以及 Backtrader 日频回测。
为保证鲁棒性,本手册每一步都强调缺失值、非数值和极端值的防护;同时不对“股票数量”和“样本期”做人为限制,程序会尽可能使用全部可用数据。
数据目录 ./data/ 需包含四张 CSMAR 原始 CSV:
- FS_Combas.csv(合并口径,资产负债表)
- FS_Comins.csv(合并口径,利润表)
- TRDNEW_Mnth.csv(月度收益与市值)
- TRDNEW_Dalyr.csv(日频 OHLCV)
依赖包:pandas、numpy、matplotlib、backtrader、ipykernel。
# Notebook-wide imports and display settings for the factor-engineering pipeline.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Widen pandas console output so wide factor tables print without truncation.
pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 180)
# Locations of the four raw CSMAR CSV inputs (see the data/ layout at the end of the manual).
DATA_DIR = "./data"
PATH_FS_BS = os.path.join(DATA_DIR, "FS_Combas.csv")    # balance sheet (consolidated)
PATH_FS_IS = os.path.join(DATA_DIR, "FS_Comins.csv")    # income statement (consolidated)
PATH_MNTH = os.path.join(DATA_DIR, "TRDNEW_Mnth.csv")   # monthly returns & market cap
PATH_DAILY = os.path.join(DATA_DIR, "TRDNEW_Dalyr.csv") # daily OHLCV
print("数据路径:", PATH_FS_BS, PATH_FS_IS, PATH_MNTH, PATH_DAILY, sep="\n")
依赖包:pandas、numpy、matplotlib、backtrader、ipykernel。
# 一键检查/创建 "FactorEngineering" 内核环境,并安装依赖
# Environment-bootstrap cell: constants for the "FactorEngineering" Jupyter kernel.
import os, sys, json, subprocess, shutil, platform
from pathlib import Path
ENV_NAME = "FactorEngineering"
DISPLAY_NAME = f"Python ({ENV_NAME})"
# Packages the course notebook needs inside the kernel environment.
REQUIRED_PKGS = ["pandas", "numpy", "matplotlib", "backtrader", "ipykernel"]
PY_VERSION = "3.10"  # adjust if needed
def run(cmd, check=True, capture=False):
    """Echo *cmd* shell-style, then execute it via subprocess.run in text mode."""
    echoed = " ".join(cmd)
    print("$", echoed)
    result = subprocess.run(cmd, check=check, text=True, capture_output=capture)
    return result

def which(x):
    """Locate executable *x* on PATH; None when not found."""
    return shutil.which(x)
def list_kernels():
    """Return a dict of installed Jupyter kernelspecs (name -> location).

    Prefers the in-process jupyter_client API; falls back to shelling out
    to `jupyter kernelspec list --json` when the API is unavailable or fails.
    """
    try:
        from jupyter_client.kernelspec import KernelSpecManager
        specs = KernelSpecManager().find_kernel_specs()
        return specs
    except Exception:
        proc = run([sys.executable, "-m", "jupyter", "kernelspec", "list", "--json"], capture=True)
        return json.loads(proc.stdout).get("kernelspecs", {})

def kernel_exists(name):
    """True when a kernelspec named *name* is already registered."""
    return name in list_kernels()
def conda_available():
    """Detect a conda-family manager on PATH, preferring mamba.

    Returns (manager_name, manager_path), or (None, None) when neither exists.
    """
    for candidate in ("mamba", "conda"):
        found = which(candidate)
        if found:
            return candidate, found
    return None, None
def conda_env_exists(manager_cmd, env_name):
    """Check whether a conda env named *env_name* already exists.

    Tries the JSON listing first; falls back to grepping the plain-text
    `env list` output when JSON parsing (or the flag) is unavailable.
    """
    suffix = os.sep + env_name
    nested = f"{os.sep}envs{os.sep}{env_name}"
    try:
        listing = run([manager_cmd, "env", "list", "--json"], capture=True)
        env_paths = json.loads(listing.stdout).get("envs", [])
        for path in env_paths:
            if path.endswith(suffix) or nested in path:
                return True
        return False
    except Exception:
        listing = run([manager_cmd, "env", "list"], capture=True)
        return any(suffix in line for line in listing.stdout.splitlines())

def conda_run(manager_cmd, env_name, args):
    """Run *args* inside the named conda env via `<manager> run -n <env>`."""
    return run([manager_cmd, "run", "-n", env_name] + args)
def venv_python_path(venv_dir: Path):
    """Return the path to the python executable inside a venv, cross-platform.

    Improvement: the original evaluated platform.system().lower().startswith("win")
    four times on one line; compute the platform check once for readability.
    """
    is_windows = platform.system().lower().startswith("win")
    bin_dir = "Scripts" if is_windows else "bin"
    exe_name = "python.exe" if is_windows else "python"
    return str(venv_dir / bin_dir / exe_name)

def ensure_packages_in_env(python_exe, packages):
    """Upgrade pip in the target interpreter, then install/upgrade *packages*."""
    run([python_exe, "-m", "pip", "install", "--upgrade", "pip"])
    run([python_exe, "-m", "pip", "install", "-U"] + packages)
def main():
    """Ensure the FactorEngineering Jupyter kernel exists.

    Reuses an already-registered kernel; otherwise creates a conda/mamba env
    (preferred) or a plain venv, installs REQUIRED_PKGS into it, and registers
    the ipykernel under DISPLAY_NAME.
    """
    print(f"目标环境:{ENV_NAME} / {DISPLAY_NAME}")
    # Already registered -> nothing to do, just ask the user to switch kernels.
    if kernel_exists(ENV_NAME):
        print(f"✓ 已存在内核:{DISPLAY_NAME}。请切换 Kernel → Change Kernel。")
        return
    mgr_name, mgr_path = conda_available()
    if mgr_name:
        # Conda/mamba route: create the env if missing, then pip-install deps inside it.
        print(f"检测到 {mgr_name},创建 Conda 环境:{ENV_NAME}")
        if not conda_env_exists(mgr_path, ENV_NAME):
            run([mgr_path, "create", "-y", "-n", ENV_NAME, f"python={PY_VERSION}"])
        else:
            print("✓ Conda 环境已存在")
        conda_run(mgr_path, ENV_NAME, ["python", "-m", "pip", "install", "-U"] + REQUIRED_PKGS)
        conda_run(mgr_path, ENV_NAME, ["python", "-m", "ipykernel", "install", "--user", "--name", ENV_NAME, "--display-name", DISPLAY_NAME])
        print(f"✓ 已注册内核:{DISPLAY_NAME}。切换后继续。")
        return
    # Fallback route: plain venv under ~/.virtualenvs.
    print("未检测到 conda/mamba,改用 venv")
    venv_dir = Path.home() / ".virtualenvs" / ENV_NAME
    venv_dir.parent.mkdir(parents=True, exist_ok=True)
    if not venv_dir.exists():
        run([sys.executable, "-m", "venv", str(venv_dir)])
    pyexe = venv_python_path(venv_dir)
    ensure_packages_in_env(pyexe, REQUIRED_PKGS)
    run([pyexe, "-m", "ipykernel", "install", "--user", "--name", ENV_NAME, "--display-name", DISPLAY_NAME])
    print(f"✓ 已注册内核:{DISPLAY_NAME}。切换后继续。")
main()
数据清洗要点:
- 月度表 mn:Trdmnt 统一到月末日期;
- 年度财务 fin:仅保留 Typrep='A'(合并口径),取 TotalAssets=A001000000、BookEquity=A003100000、NetIncome=B002000101;同一会计年度保留 Accper 最大的一条;
- AG = TotalAssets.pct_change()(按股票分组);
- 所有数值列用 errors='coerce' 数值化;对齐后 reset_index(drop=True)。
from datetime import datetime
def to_str_code(x):
    """Normalize a stock code to a 6-character zero-padded string.

    Handles float-ish inputs like '600519.0' by dropping the decimal part.
    BUG FIX: NaN input used to survive the try-block as the string 'nan'
    and come back as '000nan'; now missing input returns np.nan.
    """
    if pd.isna(x):
        return np.nan
    try:
        s = str(x).strip().split('.')[0]
        return s.zfill(6)
    except Exception:
        return np.nan
def parse_date_any(x):
    """Parse assorted CSMAR date spellings into a Timestamp (NaT on failure)."""
    if pd.isna(x):
        return pd.NaT
    text = str(x).strip()
    # Try the known explicit formats first; they are unambiguous.
    for fmt in ("%Y-%m-%d", "%Y%m%d", "%Y-%m", "%Y/%m/%d"):
        try:
            return pd.to_datetime(text, format=fmt)
        except Exception:
            continue
    # Last resort: let pandas guess the format.
    try:
        return pd.to_datetime(text)
    except Exception:
        return pd.NaT
# Monthly trading file: normalize codes/dates, coerce numeric columns, filter boards.
mn = pd.read_csv(PATH_MNTH, dtype=str)
mn['Stkcd'] = mn['Stkcd'].apply(to_str_code)
mn['Trdmnt'] = mn['Trdmnt'].apply(parse_date_any)
# Snap every month tag to its month-end date so all joins line up on one convention.
mn['Trdmnt'] = mn['Trdmnt'].dt.to_period('M').dt.to_timestamp('M')
for col in ['Mretwd','Msmvttl','Mclsprc','Markettype']:
    if col in mn.columns:
        mn[col] = pd.to_numeric(mn[col], errors='coerce')
if 'Markettype' in mn.columns:
    mn['Markettype'] = mn['Markettype'].astype('Int64')
    # Keep rows with these board codes only — presumably CSMAR Markettype values
    # for the main/SME/GEM/STAR boards; TODO confirm against the data dictionary.
    mn = mn[mn['Markettype'].isin({1,4,16,32,64})]
mn = mn.dropna(subset=['Stkcd','Trdmnt']).sort_values(['Stkcd','Trdmnt']).reset_index(drop=True)
print("月度记录数:", len(mn))
# Annual financials: balance sheet + income statement, consolidated scope ('A').
bs = pd.read_csv(PATH_FS_BS, dtype=str)
is_ = pd.read_csv(PATH_FS_IS, dtype=str)
for df in (bs, is_):
    df['Stkcd'] = df['Stkcd'].apply(to_str_code)
    df['Accper'] = df['Accper'].apply(parse_date_any)
    df['Typrep'] = df['Typrep'].astype(str).str.upper().str.strip()
# Keep consolidated statements only.
bs = bs[bs['Typrep']=='A'].copy()
is_ = is_[is_['Typrep']=='A'].copy()
# CSMAR item codes -> readable names (A001000000 total assets, A003100000 book
# equity, B002000101 net income — per the course data dictionary).
bs = bs.rename(columns={'A001000000':'TotalAssets','A003100000':'BookEquity'})
is_ = is_.rename(columns={'B002000101':'NetIncome'})
for col in ['TotalAssets','BookEquity']:
    bs[col] = pd.to_numeric(bs[col], errors='coerce')
is_['NetIncome'] = pd.to_numeric(is_['NetIncome'], errors='coerce')
bs['FiscalYear'] = bs['Accper'].dt.year
is_['FiscalYear'] = is_['Accper'].dt.year
# Within each (stock, fiscal year) keep the record with the latest Accper.
bs = (bs.sort_values(['Stkcd','FiscalYear','Accper'])
        .drop_duplicates(['Stkcd','FiscalYear'], keep='last')
        .reset_index(drop=True))
is_ = (is_.sort_values(['Stkcd','FiscalYear','Accper'])
          .drop_duplicates(['Stkcd','FiscalYear'], keep='last')
          .reset_index(drop=True))
# Outer-join BS and IS so a year missing one statement still keeps the other.
fin = (pd.merge(
        bs[['Stkcd','FiscalYear','TotalAssets','BookEquity']],
        is_[['Stkcd','FiscalYear','NetIncome']],
        on=['Stkcd','FiscalYear'], how='outer')
       .sort_values(['Stkcd','FiscalYear'])
       .reset_index(drop=True))
# Asset growth: per-stock year-over-year pct change of total assets.
fin['AG'] = fin.groupby('Stkcd')['TotalAssets'].pct_change()
fin = fin.dropna(subset=['Stkcd','FiscalYear']).reset_index(drop=True)
print("年度财务记录数:", len(fin))
display(fin.head(3))
财务对齐口径:t 年的各月份只使用上一年(t-1)的财务数据;LogMktCap = log(Msmvttl)(仅对正市值取对数);Ret_t1 = shift(-1)(按股票分组)。
按股票分组mn['Year_Fin'] = mn['Trdmnt'].dt.year - 1
mn = pd.merge(mn, fin.rename(columns={'FiscalYear':'Year_Fin'}),
on=['Stkcd','Year_Fin'], how='left', suffixes=('',''))
mcap = pd.to_numeric(mn['Msmvttl'], errors='coerce')
mn['LogMktCap'] = np.log(mcap.where(mcap > 0))
mn['Ret_t1'] = mn.groupby('Stkcd')['Mretwd'].shift(-1)
mn = mn.sort_values(['Stkcd','Trdmnt']).reset_index(drop=True)
mn[['Stkcd','Trdmnt','Year_Fin','LogMktCap','Ret_t1']].head(3)
'All'
# Raw factor levels. Industry is a single 'All' bucket until a real mapping is merged.
mn['BM'] = pd.to_numeric(mn['BookEquity'], errors='coerce') / pd.to_numeric(mn['Msmvttl'], errors='coerce')
mn['ROE'] = pd.to_numeric(mn['NetIncome'], errors='coerce') / pd.to_numeric(mn['BookEquity'], errors='coerce')
# Sign flip: higher AG_pos = lower asset growth (conservative-investment direction).
mn['AG_pos'] = - pd.to_numeric(mn['AG'], errors='coerce')
# Guard every factor and label: coerce to numeric and map +/-inf to NaN.
for col in ['BM','ROE','AG_pos','Ret_t1','LogMktCap']:
    mn[col] = pd.to_numeric(mn[col], errors='coerce').replace([np.inf,-np.inf], np.nan)
mn['Industry'] = 'All' # merge a real industry mapping here when available
mn[['Stkcd','Trdmnt','BM','ROE','AG_pos','Industry']].head(3)
因子工程口径:按 Trdmnt × Industry 分组;先 winsor,再 rank → 正态分位;用 transform 确保结果与原行索引对齐。
import math
def clip_winsor(x: pd.Series, p_low=0.01, p_high=0.99):
    """Two-sided winsorization at the given quantiles.

    Series with fewer than 5 non-missing values are returned (coerced to
    numeric) unchanged, since quantiles would be meaningless there.
    """
    values = pd.to_numeric(x, errors="coerce")
    if values.notna().sum() < 5:
        return values
    bounds = values.quantile([p_low, p_high])
    return values.clip(lower=bounds.iloc[0], upper=bounds.iloc[1])
# Acklam's rational approximation to the standard normal inverse CDF.
def norm_s_inv_approx(p):
    """Approximate Phi^{-1}(p) via Acklam's algorithm (|error| ~ 1.15e-9).

    Returns -inf for p <= 0 and +inf for p >= 1.
    """
    if p <= 0.0: return -np.inf
    if p >= 1.0: return np.inf
    a=[-3.969683028665376e+01,2.209460984245205e+02,-2.759285104469687e+02,1.383577518672690e+02,-3.066479806614716e+01,2.506628277459239e+00]
    b=[-5.447609879822406e+01,1.615858368580409e+02,-1.556989798598866e+02,6.680131188771972e+01,-1.328068155288572e+01]
    c=[-7.784894002430293e-03,-3.223964580411365e-01,-2.400758277161838e+00,-2.549732539343734e+00,4.374664141464968e+00,2.938163982698783e+00]
    d=[7.784695709041462e-03,3.224671290700398e-01,2.445134137142996e+00,3.754408661907416e+00]
    plow = 0.02425
    phigh = 1 - plow
    if p < plow:  # lower tail
        q = math.sqrt(-2*math.log(p))
        return (((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5])/((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1)
    if p > phigh:  # upper tail, by symmetry
        q = math.sqrt(-2*math.log(1-p))
        return -(((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5])/((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1)
    # Central region. BUG FIX: the original return had an extra closing paren
    # (a SyntaxError) and mis-grouped the denominator so "*r+1" fell outside
    # the division; restore Acklam's denominator polynomial.
    q = p - 0.5
    r = q*q
    num = (((((a[0]*r+a[1])*r+a[2])*r+a[3])*r+a[4])*r+a[5])*q
    den = ((((b[0]*r+b[1])*r+b[2])*r+b[3])*r+b[4])*r + 1
    return num/den
def rank_to_normal(x: pd.Series):
    """Map a series to normal quantiles of its (average) ranks; NaNs stay NaN."""
    numeric = pd.to_numeric(x, errors='coerce')
    observed = numeric.dropna()
    if observed.empty:
        return pd.Series(index=x.index, dtype=float)
    ranks = observed.rank(method='average', ascending=True)
    # (rank - 0.5) / n keeps the quantiles strictly inside (0, 1).
    quantiles = (ranks - 0.5) / len(observed)
    result = pd.Series(index=x.index, dtype=float)
    result.loc[observed.index] = quantiles.apply(norm_s_inv_approx)
    return result
# Group by (month x industry) and transform each factor within its group.
mn = mn.reset_index(drop=True)
grp = mn.groupby(['Trdmnt','Industry'], observed=True, sort=False)
factors = ['BM','ROE','AG_pos']
# Winsor: clip each factor at its group's 1%/99% quantiles.
for col in factors:
    mn[col + '_win'] = grp[col].transform(lambda s: clip_winsor(s, 0.01, 0.99))
# Rank -> normal quantiles; transform keeps results aligned to mn's row index.
def rank_to_normal_full(s: pd.Series):
    # reindex guards against the helper returning a differently-indexed Series.
    z = rank_to_normal(s)
    return z.reindex(s.index)
for col in factors:
    mn['z_' + col] = grp[col + '_win'].transform(rank_to_normal_full)
mn[['Stkcd','Trdmnt','z_BM','z_ROE','z_AG_pos']].head(3)
apply
def safe_cov_beta(z: pd.Series, s: pd.Series):
    """OLS slope of z on s (cov / var) over jointly non-missing rows.

    Returns NaN when fewer than 5 valid pairs exist or when var(s) is
    zero or non-finite.
    """
    z_num = pd.to_numeric(z, errors="coerce")
    s_num = pd.to_numeric(s, errors="coerce")
    mask = z_num.notna() & s_num.notna()
    if mask.sum() < 5:
        return np.nan
    z_arr = z_num[mask].to_numpy()
    s_arr = s_num[mask].to_numpy()
    denom = np.var(s_arr, ddof=1)
    if not np.isfinite(denom) or denom == 0:
        return np.nan
    numer = np.cov(z_arr, s_arr, ddof=1)[0, 1]
    return numer / denom
# Estimate a per-(month, industry) size beta, then residualize each z-factor
# against LogMktCap row by row (the two-step approach referenced in the tips).
grp = mn.groupby(['Trdmnt','Industry'], observed=True, sort=False)
def beta_per_group(df, zcol):
    # Slope of the z-factor on log market cap within one (month, industry) cell.
    return safe_cov_beta(df[zcol], df['LogMktCap'])
for base in ['BM','ROE','AG_pos']:
    zcol = 'z_' + base
    outcol = zcol + '_perp'
    beta_by_group = grp.apply(lambda g: beta_per_group(g, zcol))
    beta_by_group.name = 'beta'
    # Broadcast the group-level beta back to rows via a (Trdmnt, Industry)
    # MultiIndex — avoids assigning a MultiIndexed apply result directly.
    keys = pd.MultiIndex.from_frame(mn[['Trdmnt','Industry']])
    beta_vec = beta_by_group.reindex(keys).to_numpy()
    z = pd.to_numeric(mn[zcol], errors='coerce')
    s = pd.to_numeric(mn['LogMktCap'], errors='coerce')
    mn[outcol] = z - beta_vec * s
mn[['Stkcd','Trdmnt','z_BM_perp','z_ROE_perp','z_AG_pos_perp']].head(3)
研究评估口径:Score = mean(z_BM_perp, z_ROE_perp, z_AG_pos_perp);LS = Q5 - Q1;输出 ./output/rank_ic.csv、./output/quintile_returns.csv、./output/score_monthly.csv。
# Composite score: equal-weight mean of the size-neutralized z-factors.
perps = ['z_BM_perp','z_ROE_perp','z_AG_pos_perp']
mn['Score'] = mn[perps].mean(axis=1, skipna=True)
def spearman_by_month(df):
    """Spearman rank-IC between Score and next-month return for one month.

    Returns NaN when either column has fewer than 5 observations.
    """
    score_rank = df['Score'].rank(method='average')
    ret_rank = df['Ret_t1'].rank(method='average')
    enough = score_rank.notna().sum() >= 5 and ret_rank.notna().sum() >= 5
    return score_rank.corr(ret_rank) if enough else np.nan
# Monthly Rank-IC series plus its expanding (running) mean.
rank_ic = mn.groupby('Trdmnt').apply(spearman_by_month).rename('RankIC').to_frame()
rank_ic['RankIC_Mean'] = rank_ic['RankIC'].expanding().mean()
def bucket5_by_month(df):
    """Assign quintile buckets 1..5 by Score within one month.

    Returns an all-NaN series when fewer than 5 scores are available.
    """
    scores = df['Score']
    count = scores.notna().sum()
    if count < 5:
        return pd.Series(index=df.index, data=np.nan, name='Bucket')
    # method='first' breaks ties by order of appearance -> equal-size buckets.
    order = scores.rank(method='first')
    buckets = 1 + ((order - 1) * 5 // count)
    return buckets.clip(1, 5)
# Bucket every month, then average next-month returns per (month, quintile).
mn['Bucket'] = mn.groupby('Trdmnt', group_keys=False).apply(bucket5_by_month)
qret = (mn.dropna(subset=['Bucket','Ret_t1'])
          .groupby(['Trdmnt','Bucket'])['Ret_t1']
          .mean()
          .unstack('Bucket')
          .rename(columns=lambda k: f'Q{int(k)}'))
# Long-short spread: top quintile minus bottom quintile.
qret['LS'] = qret.get('Q5', np.nan) - qret.get('Q1', np.nan)
# Cumulative compounded return per column; missing months contribute 0.
for c in qret.columns:
    qret[f'Cum_{c}'] = (1 + qret[c].fillna(0)).cumprod() - 1
display(rank_ic.tail())
display(qret[['Q1','Q2','Q3','Q4','Q5','LS']].tail())
import os
os.makedirs('./output', exist_ok=True)
rank_ic.to_csv('./output/rank_ic.csv')
qret.to_csv('./output/quintile_returns.csv')
mn[['Stkcd','Trdmnt','Score']].to_csv('./output/score_monthly.csv', index=False)
print('研究评估输出已保存至 ./output/')
./output/weights_monthly.csv
def build_monthly_weights(df, long_gross=0.5, short_gross=-0.5, min_names=5):
    """Build month-by-month target weights: long the top-Score names,
    short the bottom-Score names, equal-weighted within each leg.

    Parameters
    ----------
    df : DataFrame with columns Trdmnt, Stkcd, Score.
    long_gross, short_gross : total gross exposure of each leg.
    min_names : minimum names per leg; months without 2*min_names
        (at least 10) scored stocks are skipped entirely.

    Returns
    -------
    DataFrame indexed by RebalDate, one column per stock, 0.0 where a
    stock carries no weight that month.

    BUG FIX: ranks are ascending (1 = lowest Score), so the original
    `r <= k` long leg bought the LOWEST-scoring stocks — the inverse of
    the LS = Q5 - Q1 (long high-Score) convention used in the research
    evaluation. The legs are now oriented consistently.
    """
    out = {}
    for dt, g in df.groupby('Trdmnt'):
        s = g.dropna(subset=['Score'])
        n = len(s)
        if n < max(min_names*2, 10):
            continue  # too few names to populate both legs
        r = s['Score'].rank(method='first')  # 1 = lowest score
        k = max(int(round(n * 0.2)), min_names)  # ~top/bottom quintile size
        long_names = s.loc[r > (n - k), 'Stkcd'].tolist()   # highest Scores
        short_names = s.loc[r <= k, 'Stkcd'].tolist()       # lowest Scores
        w = {}
        if len(long_names) > 0:
            w_long = long_gross / len(long_names)
            w.update({sid: w_long for sid in long_names})
        if len(short_names) > 0:
            w_short = short_gross / len(short_names)
            for sid in short_names:
                # += guards the (theoretical) case of a name in both legs.
                w[sid] = w.get(sid, 0.0) + w_short
        out[dt] = w
    weights = pd.DataFrame(out).T.sort_index()
    weights.index.name = 'RebalDate'
    weights = weights.fillna(0.0)
    return weights
# Build and persist the monthly target-weight matrix.
weights = build_monthly_weights(mn[['Trdmnt','Stkcd','Score']])
print("权重表形状:", weights.shape)
display(weights.head())
weights.to_csv('./output/weights_monthly.csv')
print('权重表已保存 ./output/weights_monthly.csv')
RebalDate
# Daily bars: parse, keep only names that ever receive a weight, and rename
# the columns to the backtrader OHLCV schema.
daily = pd.read_csv(PATH_DAILY, dtype=str)
daily['Stkcd'] = daily['Stkcd'].apply(to_str_code)
daily['Trddt'] = daily['Trddt'].apply(parse_date_any)
for col in ['Opnprc','Hiprc','Loprc','Clsprc','Dnshrtrd']:
    if col in daily.columns:
        daily[col] = pd.to_numeric(daily[col], errors='coerce')
tickers_in_weights = set(weights.columns)
daily = daily[daily['Stkcd'].isin(tickers_in_weights)].copy()
daily_bt = (daily
    .rename(columns={'Trddt':'datetime','Opnprc':'open','Hiprc':'high','Loprc':'low','Clsprc':'close','Dnshrtrd':'volume'})
    .dropna(subset=['datetime','open','high','low','close'])
    .sort_values(['Stkcd','datetime'])
)
# Trading calendar: union of all observed daily dates, sorted ascending.
all_days = pd.to_datetime(sorted(daily_bt['datetime'].dt.date.unique()))
def next_trading_day(dt):
    # First calendar entry strictly after dt; None when dt is past the sample end.
    idx = all_days.searchsorted(pd.to_datetime(dt), side='right')
    if idx < len(all_days):
        return all_days[idx].date()
    return None
# Execution-date -> target-weights mapping: each month's rebalance executes
# on the next trading day after the RebalDate (at the open).
exec_map = {}
for dt, row in weights.iterrows():
    nxt = next_trading_day(dt)
    if nxt is None:
        continue
    # Drop near-zero weights so the per-day order list stays small.
    d = {k: float(v) for k, v in row.dropna().to_dict().items() if abs(v) > 1e-12}
    exec_map[nxt] = d
print("执行日数量:", len(exec_map))
print("示例执行日:", next(iter(exec_map)) if exec_map else None)
cheat-on-open
import backtrader as bt
class StampDutyCommission(bt.CommInfoBase):
    """Percentage commission on both sides plus stamp duty on sells only
    (the A-share fee structure)."""
    params = (('stamp_duty', 0.001), ('commission', 0.00025), ('percabs', True),)
    def _getcommission(self, size, price, pseudoexec):
        notional = abs(size) * price
        fee = notional * self.p.commission
        # Stamp duty applies to the sell side only (size < 0).
        if size < 0:
            fee += notional * self.p.stamp_duty
        return fee
class MonthlyRebalance(bt.Strategy):
    """Rebalance to pre-computed target weights on execution dates.

    params.exec_map: {date -> {ticker: target_weight}}. Orders are placed
    in next_open, which backtrader only calls under cheat-on-open mode.
    """
    params = dict(exec_map=None)
    def __init__(self):
        # Fast ticker -> data-feed lookup built once at start.
        self.datas_by_ticker = {d._name: d for d in self.datas}
        self.exec_map = self.p.exec_map or {}
        self.exec_dates = set(self.exec_map.keys())
    def next_open(self):
        curdate = self.data.datetime.date(0)
        if curdate in self.exec_dates:
            target = self.exec_map[curdate]
            # Flatten every position first, then set the new targets —
            # names absent from target are thereby closed out.
            for d in self.datas:
                self.order_target_percent(d, target=0.0)
            for tk, w in target.items():
                d = self.datas_by_ticker.get(tk)
                if d is not None:
                    self.order_target_percent(d, target=float(w))
# Broker setup. BUG FIX: the strategy trades in next_open(), which backtrader
# only invokes when Cerebro runs with cheat_on_open=True; the original called
# broker.set_coc(True), which is cheat-on-CLOSE and leaves next_open never
# called (no rebalance would ever execute).
cerebro = bt.Cerebro(stdstats=False, cheat_on_open=True)
cerebro.broker.setcash(1_000_000)
cerebro.broker.setcommission(commission=0.00025)
cerebro.broker.addcommissioninfo(StampDutyCommission())
try:
    cerebro.broker.set_slippage_perc(perc=0.0002)  # 2 bps per side
except Exception:
    pass  # older backtrader versions lack this API; run without slippage
# Load one PandasData feed per stock, named by its code so the strategy can
# look feeds up by ticker.
for sid, g in daily_bt.groupby('Stkcd'):
    df = g[['datetime','open','high','low','close','volume']].copy()
    df['openinterest'] = 0
    df.set_index('datetime', inplace=True)
    datafeed = bt.feeds.PandasData(dataname=df, name=sid, timeframe=bt.TimeFrame.Days)
    cerebro.adddata(datafeed)
cerebro.addstrategy(MonthlyRebalance, exec_map=exec_map)
# maxcpus=1 — presumably to keep a deterministic single-process run; confirm.
res = cerebro.run(maxcpus=1)
portval = cerebro.broker.getvalue()
print(f'期末资产净值: {portval:,.2f}')
# Optional: cerebro.plot(iplot=False) to draw the equity curve.
./output/rank_ic.csv
、./output/quintile_returns.csv
./output/score_monthly.csv
、./output/weights_monthly.csv
# Final exports plus a quick sample-coverage summary (no artificial limits on
# sample period or stock count — the full available data is used).
mn[['Stkcd','Trdmnt','Score','Ret_t1']].to_csv('./output/monthly_score_with_nextret.csv', index=False)
weights.to_csv('./output/weights_monthly.csv')
print('导出完成:./output/monthly_score_with_nextret.csv, ./output/weights_monthly.csv')
print('样本期:', mn['Trdmnt'].min().date(), '→', mn['Trdmnt'].max().date())
print('股票数量(出现过的):', mn['Stkcd'].nunique())
常见要点与排错提示:
- Year_Fin = 年份(Trdmnt) - 1;Ret_t1 = groupby('Stkcd').shift(-1);
- 数值化统一用 errors='coerce',并把 ±inf 置为 NaN;
- LogMktCap 仅对正市值取对数;合并/筛选后及时 reset_index(drop=True);
- 分组变换优先用 transform;若用 apply,赋值前务必把返回 Series 的索引对齐回原行索引(reindex(mn.index)),否则 groupby.apply 返回的 MultiIndex 与原行索引不兼容;
- LS 累计曲线由 (1 + 月收益).cumprod() - 1 得到;
- 规模中性化采用"先按组计算 beta_by_group,再映射到行做残差化"的两步法(见第 6 节);
- 若 backtrader 版本不支持 set_slippage_perc,可改用 set_slippage_fixed 或跳过滑点设置;
- 用 display(mn.head())、display(fin.head()) 快速检查字段与 NaN 分布。
项目目录结构:
project_root/
├─ FF5_Multifactor_Backtrader.ipynb
├─ data/
│ ├─ FS_Combas.csv
│ ├─ FS_Comins.csv
│ ├─ TRDNEW_Mnth.csv
│ └─ TRDNEW_Dalyr.csv
└─ output/
├─ rank_ic.csv
├─ quintile_returns.csv
├─ score_monthly.csv
└─ weights_monthly.csv
LS = Q5 - Q1