Student Operation Manual | Quantitative Multi-Factor Strategy in Python

(FF5-inspired, China A-shares, raw CSMAR CSVs → factor engineering → research evaluation → Backtrader backtest)

This manual accompanies this week's in-class exercise, "Python from Zero to Strategy and Backtest." It fully replicates the steps and conventions of last week's Excel/WPS workflow and is meant to be used alongside the class notebook FF5_Multifactor_Backtrader.ipynb. Starting from four raw CSMAR CSV files, you will complete data cleaning, financial-statement alignment, cross-sectional factor engineering (winsorization → rank-to-normal transform → size neutralization → composite score), research evaluation (Rank-IC, quintile portfolios), and a daily-frequency Backtrader backtest.

For robustness, every step in this manual guards against missing values, non-numeric entries, and extreme values. No artificial limits are placed on the number of stocks or the sample period; the code uses all available data.
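The snippet below is a minimal sketch of that defensive pattern (the Series values are hypothetical): coerce to numeric, map infinities to NaN, and only then compute statistics.

import numpy as np
import pandas as pd

raw = pd.Series(["1.5", "abc", np.inf, None])   # hypothetical messy input
clean = pd.to_numeric(raw, errors="coerce").replace([np.inf, -np.inf], np.nan)
print(clean)         # non-numeric and infinite entries become NaN
print(clean.mean())  # pandas statistics skip NaN by default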


0. Objectives, Data, and Environment

Code: paths and display settings

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 180)

DATA_DIR = "./data"
PATH_FS_BS   = os.path.join(DATA_DIR, "FS_Combas.csv")
PATH_FS_IS   = os.path.join(DATA_DIR, "FS_Comins.csv")
PATH_MNTH    = os.path.join(DATA_DIR, "TRDNEW_Mnth.csv")
PATH_DAILY   = os.path.join(DATA_DIR, "TRDNEW_Dalyr.csv")

print("Data paths:", PATH_FS_BS, PATH_FS_IS, PATH_MNTH, PATH_DAILY, sep="\n")
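Before moving on, a quick existence check (a minimal sketch; adjust DATA_DIR if your files live elsewhere):

for p in [PATH_FS_BS, PATH_FS_IS, PATH_MNTH, PATH_DAILY]:
    status = "OK" if os.path.exists(p) else "MISSING"
    print(f"{status:8s} {p}")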

1. Environment and Kernel Setup (FactorEngineering)

Code: create/register the kernel and install dependencies in one step

# Check for / create the "FactorEngineering" kernel environment and install dependencies in one step
import os, sys, json, subprocess, shutil, platform
from pathlib import Path

ENV_NAME = "FactorEngineering"
DISPLAY_NAME = f"Python ({ENV_NAME})"
REQUIRED_PKGS = ["pandas", "numpy", "matplotlib", "backtrader", "ipykernel"]
PY_VERSION = "3.10"  # adjust as needed

def run(cmd, check=True, capture=False):
    print("$", " ".join(cmd))
    return subprocess.run(cmd, check=check, text=True, capture_output=capture)

def which(x): return shutil.which(x)

def list_kernels():
    try:
        from jupyter_client.kernelspec import KernelSpecManager
        return KernelSpecManager().find_kernel_specs()
    except Exception:
        p = run([sys.executable, "-m", "jupyter", "kernelspec", "list", "--json"], capture=True)
        return json.loads(p.stdout).get("kernelspecs", {})

def kernel_exists(name): return name in list_kernels()

def conda_available():
    for name in ["mamba", "conda"]:
        p = which(name)
        if p: return name, p
    return None, None

def conda_env_exists(manager_cmd, env_name):
    try:
        p = run([manager_cmd, "env", "list", "--json"], capture=True)
        envs = json.loads(p.stdout).get("envs", [])
        return any(path.endswith(os.sep + env_name) or f"{os.sep}envs{os.sep}{env_name}" in path for path in envs)
    except Exception:
        p = run([manager_cmd, "env", "list"], capture=True)
        return any(f"{os.sep}{env_name}" in line for line in p.stdout.splitlines())

def conda_run(manager_cmd, env_name, args): return run([manager_cmd, "run", "-n", env_name] + args)
def venv_python_path(venv_dir: Path):
    return str(venv_dir / ("Scripts" if platform.system().lower().startswith("win") else "bin") / ("python.exe" if platform.system().lower().startswith("win") else "python"))

def ensure_packages_in_env(python_exe, packages):
    run([python_exe, "-m", "pip", "install", "--upgrade", "pip"])
    run([python_exe, "-m", "pip", "install", "-U"] + packages)

def main():
    print(f"Target environment: {ENV_NAME} / {DISPLAY_NAME}")
    if kernel_exists(ENV_NAME):
        print(f"✓ Kernel already exists: {DISPLAY_NAME}. Switch via Kernel → Change Kernel.")
        return

    mgr_name, mgr_path = conda_available()
    if mgr_name:
        print(f"Detected {mgr_name}; creating Conda environment: {ENV_NAME}")
        if not conda_env_exists(mgr_path, ENV_NAME):
            run([mgr_path, "create", "-y", "-n", ENV_NAME, f"python={PY_VERSION}"])
        else:
            print("✓ Conda environment already exists")

        conda_run(mgr_path, ENV_NAME, ["python", "-m", "pip", "install", "-U"] + REQUIRED_PKGS)
        conda_run(mgr_path, ENV_NAME, ["python", "-m", "ipykernel", "install", "--user", "--name", ENV_NAME, "--display-name", DISPLAY_NAME])
        print(f"✓ Kernel registered: {DISPLAY_NAME}. Switch to it and continue.")
        return

    print("conda/mamba not found; falling back to venv")
    venv_dir = Path.home() / ".virtualenvs" / ENV_NAME
    venv_dir.parent.mkdir(parents=True, exist_ok=True)
    if not venv_dir.exists():
        run([sys.executable, "-m", "venv", str(venv_dir)])
    pyexe = venv_python_path(venv_dir)
    ensure_packages_in_env(pyexe, REQUIRED_PKGS)
    run([pyexe, "-m", "ipykernel", "install", "--user", "--name", ENV_NAME, "--display-name", DISPLAY_NAME])
    print(f"✓ Kernel registered: {DISPLAY_NAME}. Switch to it and continue.")

main()
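After switching to the Python (FactorEngineering) kernel, the following sketch confirms the active interpreter can see the required packages:

import importlib, sys

print("Interpreter:", sys.executable)
for pkg in ["pandas", "numpy", "matplotlib", "backtrader"]:
    try:
        mod = importlib.import_module(pkg)
        print(f"{pkg:12s}", getattr(mod, "__version__", "unknown"))
    except ImportError as exc:
        print(f"{pkg:12s} NOT AVAILABLE: {exc}")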

2. Read and Clean the Four CSVs (A-share filter, annual convention, AG calculation)

Code: read and clean

from datetime import datetime

def to_str_code(x):
    try:
        s = str(x).strip().split('.')[0]
        return s.zfill(6)
    except Exception:
        return np.nan

def parse_date_any(x):
    if pd.isna(x): return pd.NaT
    s = str(x).strip()
    for fmt in ("%Y-%m-%d", "%Y%m%d", "%Y-%m", "%Y/%m/%d"):
        try: return pd.to_datetime(s, format=fmt)
        except Exception: pass
    try: return pd.to_datetime(s)
    except Exception: return pd.NaT

# Monthly trading data
mn = pd.read_csv(PATH_MNTH, dtype=str)
mn['Stkcd']  = mn['Stkcd'].apply(to_str_code)
mn['Trdmnt'] = mn['Trdmnt'].apply(parse_date_any)
mn['Trdmnt'] = mn['Trdmnt'].dt.to_period('M').dt.to_timestamp('M')
for col in ['Mretwd','Msmvttl','Mclsprc','Markettype']:
    if col in mn.columns:
        mn[col] = pd.to_numeric(mn[col], errors='coerce')
if 'Markettype' in mn.columns:
    mn['Markettype'] = mn['Markettype'].astype('Int64')
    mn = mn[mn['Markettype'].isin({1,4,16,32,64})]
mn = mn.dropna(subset=['Stkcd','Trdmnt']).sort_values(['Stkcd','Trdmnt']).reset_index(drop=True)
print("Monthly records:", len(mn))

# Annual financials: balance sheet + income statement (consolidated statements, Typrep == 'A')
bs = pd.read_csv(PATH_FS_BS, dtype=str)
is_ = pd.read_csv(PATH_FS_IS, dtype=str)
for df in (bs, is_):
    df['Stkcd']  = df['Stkcd'].apply(to_str_code)
    df['Accper'] = df['Accper'].apply(parse_date_any)
    df['Typrep'] = df['Typrep'].astype(str).str.upper().str.strip()

bs = bs[bs['Typrep']=='A'].copy()
is_ = is_[is_['Typrep']=='A'].copy()
bs = bs.rename(columns={'A001000000':'TotalAssets','A003100000':'BookEquity'})
is_ = is_.rename(columns={'B002000101':'NetIncome'})

for col in ['TotalAssets','BookEquity']:
    bs[col] = pd.to_numeric(bs[col], errors='coerce')
is_['NetIncome'] = pd.to_numeric(is_['NetIncome'], errors='coerce')

bs['FiscalYear'] = bs['Accper'].dt.year
is_['FiscalYear'] = is_['Accper'].dt.year

bs = (bs.sort_values(['Stkcd','FiscalYear','Accper'])
        .drop_duplicates(['Stkcd','FiscalYear'], keep='last')
        .reset_index(drop=True))
is_ = (is_.sort_values(['Stkcd','FiscalYear','Accper'])
        .drop_duplicates(['Stkcd','FiscalYear'], keep='last')
        .reset_index(drop=True))

fin = (pd.merge(
    bs[['Stkcd','FiscalYear','TotalAssets','BookEquity']],
    is_[['Stkcd','FiscalYear','NetIncome']],
    on=['Stkcd','FiscalYear'], how='outer')
    .sort_values(['Stkcd','FiscalYear'])
    .reset_index(drop=True))

fin['AG'] = fin.groupby('Stkcd')['TotalAssets'].pct_change()
fin = fin.dropna(subset=['Stkcd','FiscalYear']).reset_index(drop=True)

print("Annual financial records:", len(fin))
display(fin.head(3))

3. Map Annual Financials to Monthly Data (avoid look-ahead)

Code: mapping and derived variables

mn['Year_Fin'] = mn['Trdmnt'].dt.year - 1
mn = pd.merge(mn, fin.rename(columns={'FiscalYear':'Year_Fin'}),
              on=['Stkcd','Year_Fin'], how='left', suffixes=('',''))

mcap = pd.to_numeric(mn['Msmvttl'], errors='coerce')
mn['LogMktCap'] = np.log(mcap.where(mcap > 0))
mn['Ret_t1'] = mn.groupby('Stkcd')['Mretwd'].shift(-1)

mn = mn.sort_values(['Stkcd','Trdmnt']).reset_index(drop=True)
mn[['Stkcd','Trdmnt','Year_Fin','LogMktCap','Ret_t1']].head(3)
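Because Year_Fin is the month's calendar year minus one, every month in year T only sees fiscal-year T−1 financials. The assertion below (a small sanity-check sketch) makes the no-look-ahead convention explicit:

matched = mn.dropna(subset=['TotalAssets'])
assert (matched['Year_Fin'] < matched['Trdmnt'].dt.year).all(), "look-ahead detected"
print("No look-ahead: every matched financial record is from a prior fiscal year.")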

4. Construct Raw Descriptors (BM, ROE, AG_pos)

Code: descriptor construction

mn['BM']     = pd.to_numeric(mn['BookEquity'], errors='coerce') / pd.to_numeric(mn['Msmvttl'], errors='coerce')
mn['ROE']    = pd.to_numeric(mn['NetIncome'],  errors='coerce') / pd.to_numeric(mn['BookEquity'], errors='coerce')
mn['AG_pos'] = - pd.to_numeric(mn['AG'], errors='coerce')

for col in ['BM','ROE','AG_pos','Ret_t1','LogMktCap']:
    mn[col] = pd.to_numeric(mn[col], errors='coerce').replace([np.inf,-np.inf], np.nan)

mn['Industry'] = 'All'  # merge an industry mapping here when one is available
mn[['Stkcd','Trdmnt','BM','ROE','AG_pos','Industry']].head(3)

5. Within-Industry Winsorization (1%/99%) and Rank-to-Normal Transform

Code: winsorization and normalization

import math

def clip_winsor(x: pd.Series, p_low=0.01, p_high=0.99):
    x = pd.to_numeric(x, errors="coerce")
    if x.notna().sum() < 5: return x
    lo, hi = x.quantile(p_low), x.quantile(p_high)
    return x.clip(lower=lo, upper=hi)

# Acklam's approximation to the standard normal quantile function (inverse CDF)
def norm_s_inv_approx(p):
    if p <= 0.0: return -np.inf
    if p >= 1.0: return np.inf
    a=[-3.969683028665376e+01,2.209460984245205e+02,-2.759285104469687e+02,1.383577518672690e+02,-3.066479806614716e+01,2.506628277459239e+00]
    b=[-5.447609879822406e+01,1.615858368580409e+02,-1.556989798598866e+02,6.680131188771972e+01,-1.328068155288572e+01]
    c=[-7.784894002430293e-03,-3.223964580411365e-01,-2.400758277161838e+00,-2.549732539343734e+00,4.374664141464968e+00,2.938163982698783e+00]
    d=[7.784695709041462e-03,3.224671290700398e-01,2.445134137142996e+00,3.754408661907416e+00]
    plow=0.02425; phigh=1-plow
    if p<plow:
        q=math.sqrt(-2*math.log(p))
        return (((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5])/((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1)
    if p>phigh:
        q=math.sqrt(-2*math.log(1-p))
        return -(((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5])/((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1)
    q=p-0.5; r=q*q
    return (((((a[0]*r+a[1])*r+a[2])*r+a[3])*r+a[4])*r+a[5])*q / (((((b[0]*r+b[1])*r+b[2])*r+b[3])*r+b[4])*r+1)

def rank_to_normal(x: pd.Series):
    x = pd.to_numeric(x, errors='coerce')
    s = x.dropna()
    if s.empty: return pd.Series(index=x.index, dtype=float)
    r = s.rank(method='average', ascending=True)
    u = (r - 0.5) / len(s)
    z = u.apply(norm_s_inv_approx)
    out = pd.Series(index=x.index, dtype=float)
    out.loc[s.index] = z
    return out

# Grouping and transformations
mn = mn.reset_index(drop=True)
grp = mn.groupby(['Trdmnt','Industry'], observed=True, sort=False)
factors = ['BM','ROE','AG_pos']

# Winsor
for col in factors:
    mn[col + '_win'] = grp[col].transform(lambda s: clip_winsor(s, 0.01, 0.99))

# Rank → normal
def rank_to_normal_full(s: pd.Series):
    z = rank_to_normal(s)
    return z.reindex(s.index)

for col in factors:
    mn['z_' + col] = grp[col + '_win'].transform(rank_to_normal_full)

mn[['Stkcd','Trdmnt','z_BM','z_ROE','z_AG_pos']].head(3)
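To convince yourself that the Acklam approximation and the rank transform behave as expected, compare a few quantiles with textbook values and confirm each monthly cross-section is roughly centered at zero (a quick check; the reference values are standard normal quantiles):

for p, expected in [(0.5, 0.0), (0.975, 1.95996), (0.05, -1.64485)]:
    print(f"p={p:<6} approx={norm_s_inv_approx(p):+.5f} expected={expected:+.5f}")

print(mn.groupby('Trdmnt')['z_BM'].mean().describe())  # monthly means should be near 0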

6. Size Neutralization (residual against log market cap)

Code: size neutralization (robust version)

def safe_cov_beta(z: pd.Series, s: pd.Series):
    z = pd.to_numeric(z, errors="coerce")
    s = pd.to_numeric(s, errors="coerce")
    valid = z.notna() & s.notna()
    if valid.sum() < 5: return np.nan
    zc = z[valid].to_numpy(); sc = s[valid].to_numpy()
    var = np.var(sc, ddof=1)
    if not np.isfinite(var) or var == 0: return np.nan
    cov = np.cov(zc, sc, ddof=1)[0,1]
    return cov / var

# Compute beta for each group and map it back to residuals
grp = mn.groupby(['Trdmnt','Industry'], observed=True, sort=False)

def beta_per_group(df, zcol):
    return safe_cov_beta(df[zcol], df['LogMktCap'])

for base in ['BM','ROE','AG_pos']:
    zcol = 'z_' + base
    outcol = zcol + '_perp'

    beta_by_group = grp.apply(lambda g: beta_per_group(g, zcol))
    beta_by_group.name = 'beta'

    keys = pd.MultiIndex.from_frame(mn[['Trdmnt','Industry']])
    beta_vec = beta_by_group.reindex(keys).to_numpy()

    z = pd.to_numeric(mn[zcol], errors='coerce')
    s = pd.to_numeric(mn['LogMktCap'], errors='coerce')
    mn[outcol] = z - beta_vec * s

mn[['Stkcd','Trdmnt','z_BM_perp','z_ROE_perp','z_AG_pos_perp']].head(3)
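Neutralization here computes z_perp = z − β·LogMktCap with β = cov(z, LogMktCap) / var(LogMktCap) per month/industry group, so by construction the residual is (numerically) uncorrelated with size within each group. A quick verification sketch:

chk = (mn.groupby(['Trdmnt','Industry'], observed=True)
         .apply(lambda g: g['z_BM_perp'].corr(g['LogMktCap'])))
print(chk.abs().describe())  # within-group |corr| should be close to 0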

7. Composite Score and Research Evaluation (Rank-IC, quintiles)

Code: composite score and evaluation

perps = ['z_BM_perp','z_ROE_perp','z_AG_pos_perp']
mn['Score'] = mn[perps].mean(axis=1, skipna=True)

def spearman_by_month(df):
    x = df['Score'].rank(method='average')
    y = df['Ret_t1'].rank(method='average')
    if x.notna().sum() >= 5 and y.notna().sum() >= 5:
        return x.corr(y)
    return np.nan

rank_ic = mn.groupby('Trdmnt').apply(spearman_by_month).rename('RankIC').to_frame()
rank_ic['RankIC_Mean'] = rank_ic['RankIC'].expanding().mean()

def bucket5_by_month(df):
    s = df['Score']
    n = s.notna().sum()
    if n < 5:
        return pd.Series(index=df.index, data=np.nan, name='Bucket')
    r = s.rank(method='first')
    b = 1 + ((r - 1) * 5 // n)
    return b.clip(1,5)

mn['Bucket'] = mn.groupby('Trdmnt', group_keys=False).apply(bucket5_by_month)

qret = (mn.dropna(subset=['Bucket','Ret_t1'])
          .groupby(['Trdmnt','Bucket'])['Ret_t1']
          .mean()
          .unstack('Bucket')
          .rename(columns=lambda k: f'Q{int(k)}'))

qret['LS'] = qret.get('Q5', np.nan) - qret.get('Q1', np.nan)
for c in qret.columns:
    qret[f'Cum_{c}'] = (1 + qret[c].fillna(0)).cumprod() - 1

display(rank_ic.tail())
display(qret[['Q1','Q2','Q3','Q4','Q5','LS']].tail())

import os
os.makedirs('./output', exist_ok=True)
rank_ic.to_csv('./output/rank_ic.csv')
qret.to_csv('./output/quintile_returns.csv')
mn[['Stkcd','Trdmnt','Score']].to_csv('./output/score_monthly.csv', index=False)
print('Research evaluation outputs saved to ./output/')
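A quick visual check often tells you more than the tables; a minimal plotting sketch using the matplotlib import from Section 0:

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
rank_ic['RankIC'].plot(ax=axes[0], title='Monthly Rank-IC')
rank_ic['RankIC_Mean'].plot(ax=axes[0], linestyle='--')
axes[0].axhline(0, color='grey', linewidth=0.8)
qret[[f'Cum_Q{i}' for i in range(1, 6)] + ['Cum_LS']].plot(ax=axes[1], title='Cumulative quintile returns')
plt.tight_layout()
plt.show()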

8. Generate Monthly Long-Short Weights (top 20% vs bottom 20%)

Code: build the weights

def build_monthly_weights(df, long_gross=0.5, short_gross=-0.5, min_names=5):
    out = {}
    for dt, g in df.groupby('Trdmnt'):
        s = g.dropna(subset=['Score'])
        n = len(s)
        if n < max(min_names*2, 10):
            continue
        r = s['Score'].rank(method='first', ascending=False)   # rank 1 = highest Score
        k = max(int(round(n * 0.2)), min_names)
        long_names  = s.loc[r <= k, 'Stkcd'].tolist()           # top 20% by Score -> long
        short_names = s.loc[r > (n - k), 'Stkcd'].tolist()      # bottom 20% by Score -> short
        w = {}
        if len(long_names) > 0:
            w_long = long_gross / len(long_names)
            w.update({sid: w_long for sid in long_names})
        if len(short_names) > 0:
            w_short = short_gross / len(short_names)
            for sid in short_names:
                w[sid] = w.get(sid, 0.0) + w_short
        out[dt] = w
    weights = pd.DataFrame(out).T.sort_index()
    weights.index.name = 'RebalDate'
    weights = weights.fillna(0.0)
    return weights

weights = build_monthly_weights(mn[['Trdmnt','Stkcd','Score']])
print("Weights table shape:", weights.shape)
display(weights.head())

weights.to_csv('./output/weights_monthly.csv')
print('Weights table saved to ./output/weights_monthly.csv')
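Before feeding the weights to the backtest, verify that every rebalance date has the intended exposures (long gross +0.5, short gross −0.5, net close to 0):

expo = pd.DataFrame({
    'long_gross':  weights.clip(lower=0).sum(axis=1),
    'short_gross': weights.clip(upper=0).sum(axis=1),
    'net':         weights.sum(axis=1),
})
print(expo.describe())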

9. Prepare Daily Data and the Execution-Day Mapping

Code: daily data preparation and execution-day mapping

daily = pd.read_csv(PATH_DAILY, dtype=str)
daily['Stkcd'] = daily['Stkcd'].apply(to_str_code)
daily['Trddt'] = daily['Trddt'].apply(parse_date_any)

for col in ['Opnprc','Hiprc','Loprc','Clsprc','Dnshrtrd']:
    if col in daily.columns:
        daily[col] = pd.to_numeric(daily[col], errors='coerce')

tickers_in_weights = set(weights.columns)
daily = daily[daily['Stkcd'].isin(tickers_in_weights)].copy()

daily_bt = (daily
    .rename(columns={'Trddt':'datetime','Opnprc':'open','Hiprc':'high','Loprc':'low','Clsprc':'close','Dnshrtrd':'volume'})
    .dropna(subset=['datetime','open','high','low','close'])
    .sort_values(['Stkcd','datetime'])
)

# Trading calendar
all_days = pd.to_datetime(sorted(daily_bt['datetime'].dt.date.unique()))

def next_trading_day(dt):
    idx = all_days.searchsorted(pd.to_datetime(dt), side='right')
    if idx < len(all_days):
        return all_days[idx].date()
    return None

# Build the execution day → weights mapping
exec_map = {}
for dt, row in weights.iterrows():
    nxt = next_trading_day(dt)
    if nxt is None:
        continue
    d = {k: float(v) for k, v in row.dropna().to_dict().items() if abs(v) > 1e-12}
    exec_map[nxt] = d

print("Number of execution days:", len(exec_map))
print("Example execution day:", next(iter(exec_map)) if exec_map else None)
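Each execution day should be the first trading day strictly after its rebalance month-end; a quick spot check on the first few rebalance dates:

for dt in list(weights.index)[:3]:
    print(f"rebalance {dt.date()} -> execute {next_trading_day(dt)}")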

10. Backtrader Strategy and Backtest

Code: run the backtest

import backtrader as bt

class StampDutyCommission(bt.CommInfoBase):
    params = (('stamp_duty', 0.001), ('commission', 0.00025), ('percabs', True),)
    def _getcommission(self, size, price, pseudoexec):
        comm = abs(size) * price * self.p.commission
        if size < 0:  # stamp duty is charged on sells
            comm += abs(size) * price * self.p.stamp_duty
        return comm

class MonthlyRebalance(bt.Strategy):
    params = dict(exec_map=None)
    def __init__(self):
        self.datas_by_ticker = {d._name: d for d in self.datas}
        self.exec_map = self.p.exec_map or {}
        self.exec_dates = set(self.exec_map.keys())
    def next_open(self):
        curdate = self.data.datetime.date(0)
        if curdate in self.exec_dates:
            target = self.exec_map[curdate]
            for d in self.datas:
                self.order_target_percent(d, target=0.0)
            for tk, w in target.items():
                d = self.datas_by_ticker.get(tk)
                if d is not None:
                    self.order_target_percent(d, target=float(w))

cerebro = bt.Cerebro(stdstats=False, cheat_on_open=True)  # cheat-on-open: next_open() is called and orders fill at the same bar's open
cerebro.broker.setcash(1_000_000)
cerebro.broker.addcommissioninfo(StampDutyCommission())  # percentage commission plus stamp duty on sells (supersedes setcommission)
try:
    cerebro.broker.set_slippage_perc(perc=0.0002)  # 2 bps
except Exception:
    pass

# Load data feeds into Cerebro
for sid, g in daily_bt.groupby('Stkcd'):
    df = g[['datetime','open','high','low','close','volume']].copy()
    df['openinterest'] = 0
    df.set_index('datetime', inplace=True)
    datafeed = bt.feeds.PandasData(dataname=df, name=sid, timeframe=bt.TimeFrame.Days)
    cerebro.adddata(datafeed)

cerebro.addstrategy(MonthlyRebalance, exec_map=exec_map)
res = cerebro.run(maxcpus=1)
portval = cerebro.broker.getvalue()
print(f'Final portfolio value: {portval:,.2f}')
# Optional: cerebro.plot(iplot=False)
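If you also want summary statistics (Sharpe ratio, drawdown, monthly returns), analyzers must be attached before run(). The sketch below rebuilds a fresh Cerebro with the same feeds, strategy, and costs, attaches three standard Backtrader analyzers, and reads their output; treat it as optional reference code rather than part of the required workflow.

cerebro2 = bt.Cerebro(stdstats=False, cheat_on_open=True)
cerebro2.broker.setcash(1_000_000)
cerebro2.broker.addcommissioninfo(StampDutyCommission())
for sid, g in daily_bt.groupby('Stkcd'):
    df = g[['datetime','open','high','low','close','volume']].copy()
    df['openinterest'] = 0
    df.set_index('datetime', inplace=True)
    cerebro2.adddata(bt.feeds.PandasData(dataname=df, name=sid, timeframe=bt.TimeFrame.Days))
cerebro2.addstrategy(MonthlyRebalance, exec_map=exec_map)
cerebro2.addanalyzer(bt.analyzers.SharpeRatio, _name='sharpe', timeframe=bt.TimeFrame.Months)
cerebro2.addanalyzer(bt.analyzers.DrawDown, _name='dd')
cerebro2.addanalyzer(bt.analyzers.TimeReturn, _name='ret', timeframe=bt.TimeFrame.Months)
strat = cerebro2.run(maxcpus=1)[0]
print('Sharpe:', strat.analyzers.sharpe.get_analysis())
print('Max drawdown (%):', strat.analyzers.dd.get_analysis()['max']['drawdown'])
monthly_ret = pd.Series(strat.analyzers.ret.get_analysis())
print(monthly_ret.describe())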

11. Export Results and Review

Code: export and summary statistics

mn[['Stkcd','Trdmnt','Score','Ret_t1']].to_csv('./output/monthly_score_with_nextret.csv', index=False)
weights.to_csv('./output/weights_monthly.csv')

print('Export complete: ./output/monthly_score_with_nextret.csv, ./output/weights_monthly.csv')
print('Sample period:', mn['Trdmnt'].min().date(), '→', mn['Trdmnt'].max().date())
print('Number of stocks (ever appearing):', mn['Stkcd'].nunique())

12. Quality-Control Checklist (self-check before submission)


13. Common Issues and Fixes


14. Extension Assignments and Discussion


15. Suggested File Structure

project_root/
├─ FF5_Multifactor_Backtrader.ipynb
├─ data/
│  ├─ FS_Combas.csv
│  ├─ FS_Comins.csv
│  ├─ TRDNEW_Mnth.csv
│  └─ TRDNEW_Dalyr.csv
└─ output/
   ├─ rank_ic.csv
   ├─ quintile_returns.csv
   ├─ score_monthly.csv
   └─ weights_monthly.csv

16. Quick Reference: Key Formulas and Implementation Notes
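A compact restatement of the quantities implemented above (BE = BookEquity, ME = Msmvttl, NI = NetIncome, TA = TotalAssets, s = LogMktCap; all cross-sectional steps are per month/industry group):

- Descriptors: BM = BE / ME, ROE = NI / BE, AG = TA_t / TA_{t-1} − 1, AG_pos = −AG
- Winsorization: clip each descriptor to its within-group 1st/99th percentiles
- Rank-to-normal: z = Φ⁻¹((rank − 0.5) / n), with Φ⁻¹ computed by Acklam's approximation
- Size neutralization: z_perp = z − β·s, where β = cov(z, s) / var(s)
- Composite: Score = mean(z_BM_perp, z_ROE_perp, z_AG_pos_perp), skipping NaN
- Rank-IC: Spearman correlation of Score with next-month return (Ret_t1), computed month by month
- Quintiles: rank by Score within each month into Q1…Q5; long-short spread LS = Q5 − Q1
- Weights: top 20% by Score long (+0.5 gross), bottom 20% short (−0.5 gross), equal-weighted within each leg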