金融文本分析:金融新闻情感分析与市场预测¶

目标:通过分析金融新闻文本的情感,预测市场短期走势

数据:

  • 金融新闻文章(可使用Financial PhraseBank数据集)
  • 对应时间段的市场指数数据

分析流程:

  1. 环境设置与数据获取:
In [ ]:
# !pip install pandas numpy matplotlib scikit-learn gensim nltk transformers torch pyLDAvis  
# 以上安装命令可根据需要取消注释  

import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split  
import nltk  
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  
from gensim import corpora, models  
import pyLDAvis.gensim_models  
from transformers import BertTokenizer, BertForSequenceClassification  
import torch  

# 下载必要的NLTK资源  
# nltk.download('punkt')  
# nltk.download('stopwords')  

# 数据加载示例  
news_data = pd.read_csv('financial_news.csv')  
market_data = pd.read_csv('market_index.csv')
  1. 文本预处理:
In [ ]:
def preprocess_text(text):  
    # 转小写  
    text = text.lower()  
    # 分词  
    tokens = word_tokenize(text)  
    # 去停用词  
    stop_words = set(stopwords.words('english'))  
    financial_stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'if', 'because', 'as', 'what',   
                           'company', 'stock', 'market', 'price', 'share'}  
    stop_words = stop_words.union(financial_stop_words)  
    filtered_tokens = [token for token in tokens if token.isalpha() and token not in stop_words]  
    return filtered_tokens  

# 应用预处理  
news_data['tokens'] = news_data['text'].apply(preprocess_text)
  1. LDA主题模型:
In [ ]:
# 创建词典  
dictionary = corpora.Dictionary(news_data['tokens'])  
# 过滤极端频率的词  
dictionary.filter_extremes(no_below=5, no_above=0.5)  

# 创建文档-词频矩阵  
corpus = [dictionary.doc2bow(doc) for doc in news_data['tokens']]  

# 训练LDA模型  
num_topics = 10  # 可以尝试不同的主题数  
lda_model = models.LdaModel(  
    corpus=corpus,  
    id2word=dictionary,  
    num_topics=num_topics,  
    passes=10,  
    alpha='auto',  
    eta='auto'  
)  

# 查看主题  
for idx, topic in lda_model.print_topics(-1):  
    print(f'Topic {idx}: {topic}')  

# 可视化LDA结果  
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)  
pyLDAvis.display(lda_vis)  

# 为每篇文章分配主题  
def get_dominant_topic(ldamodel, corpus, texts):  
    topic_probabilities = []  
    for i, row in enumerate(ldamodel[corpus]):  
        row = sorted(row, key=lambda x: x[1], reverse=True)  
        # 获取概率最高的主题  
        topic_num = row[0][0]  
        topic_prob = row[0][1]  
        topic_probabilities.append((topic_num, topic_prob))  
    return topic_probabilities  

topic_results = get_dominant_topic(lda_model, corpus, news_data['tokens'])  
news_data['dominant_topic'] = [result[0] for result in topic_results]  
news_data['topic_probability'] = [result[1] for result in topic_results]  

# 按日期聚合主题分布  
daily_topics = news_data.groupby(['date', 'dominant_topic']).size().unstack(fill_value=0)  
# 归一化,计算每日各主题占比  
daily_topics = daily_topics.div(daily_topics.sum(axis=1), axis=0)
  1. 情感分析:
In [ ]:
# 使用FinBERT进行情感分析  
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')  
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')  

def analyze_sentiment(text):  
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)  
    outputs = model(**inputs)  
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)  
    return probabilities.detach().numpy()[0]  # 返回正面、负面、中性概率  

# 应用到新闻样本(可以是部分样本,以节省计算时间)  
sample_size = min(500, len(news_data))  # 根据计算资源调整样本大小  
news_sample = news_data.sample(sample_size, random_state=42)  
news_sample['sentiment_scores'] = news_sample['text'].apply(analyze_sentiment)  

# 提取各情感分数  
news_sample['positive'] = news_sample['sentiment_scores'].apply(lambda x: x[0])  
news_sample['negative'] = news_sample['sentiment_scores'].apply(lambda x: x[1])  
news_sample['neutral'] = news_sample['sentiment_scores'].apply(lambda x: x[2])  

# 按日期聚合情感分数  
daily_sentiment = news_sample.groupby('date').agg({  
    'positive': 'mean',  
    'negative': 'mean',  
    'neutral': 'mean'  
}).reset_index()
  1. 主题-情感-市场关系分析:
In [ ]:
# 合并情感、主题和市场数据  
merged_data = pd.merge(daily_sentiment, daily_topics, on='date')  
merged_data = pd.merge(merged_data, market_data, on='date')  

# 创建预测目标:次日回报率  
merged_data['next_day_return'] = merged_data['market_index'].pct_change().shift(-1)  

# 移除缺失值  
merged_data = merged_data.dropna()  

# 特征工程  
X = merged_data[['positive', 'negative', 'neutral'] +   
               [f'topic_{i}' for i in range(num_topics)]]  # 情感和主题特征  
y = (merged_data['next_day_return'] > 0).astype(int)  # 二分类:上涨或下跌  

# 分割数据  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  

# 训练随机森林分类器  
from sklearn.ensemble import RandomForestClassifier  
model = RandomForestClassifier(n_estimators=100, random_state=42)  
model.fit(X_train, y_train)  

# 评估模型  
from sklearn.metrics import classification_report, confusion_matrix  
predictions = model.predict(X_test)  
print(classification_report(y_test, predictions))  

# 特征重要性分析  
feature_imp = pd.DataFrame(sorted(zip(model.feature_importances_, X.columns)),  
                           columns=['Value','Feature'])  
plt.figure(figsize=(10, 6))  
plt.barh(feature_imp['Feature'], feature_imp['Value'])  
plt.title('Feature Importance for Market Direction Prediction')  
plt.tight_layout()  
plt.show()
  1. 主题内容解释与可视化:
In [ ]:
# 可视化主题与市场走势  
plt.figure(figsize=(12, 8))  

# 绘制主要主题趋势  
top_topics = 3  # 选择最重要的几个主题  
top_topic_indices = feature_imp[feature_imp['Feature'].str.contains('topic')].nlargest(top_topics, 'Value')['Feature']  

for topic_col in top_topic_indices:  
    topic_idx = int(topic_col.split('_')[1])  
    plt.plot(merged_data['date'], merged_data[topic_col], label=f'Topic {topic_idx}')  

plt.title('Topic Trends Over Time')  
plt.legend()  
plt.subplot(2, 1, 2)  
plt.plot(merged_data['date'], merged_data['market_index'], 'k-')  
plt.title('Market Index')  

plt.tight_layout()  
plt.show()  

# 展示主题词云  
from wordcloud import WordCloud  

def plot_topic_wordcloud(lda_model, topic_id):  
    topic_words = dict(lda_model.show_topic(topic_id, 30))  
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(topic_words)  
    
    plt.figure(figsize=(10, 5))  
    plt.imshow(wordcloud, interpolation='bilinear')  
    plt.axis('off')  
    plt.title(f'Topic {topic_id} Word Cloud')  
    plt.show()  

# 为重要主题生成词云  
for topic_col in top_topic_indices:  
    topic_idx = int(topic_col.split('_')[1])  
    plot_topic_wordcloud(lda_model, topic_idx)

金融图像分析:金融图表模式识别与交易信号生成¶

目标:通过卷积神经网络识别股票K线图中的技术形态,生成交易信号

数据:

  • 股票历史K线图图像(可通过绘制或网络抓取获取)
  • 带有技术形态标注的图像数据集

分析流程:

  1. 数据获取与预处理:
In [ ]:
# %pip install tushare
In [ ]:
# %pip install opencv-python
In [ ]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt  
import tushare as ts  
from PIL import Image  
import cv2  
# 设置Tushare API Token  
# ts.set_token('YOUR_TUSHARE_TOKEN')  
ts.set_token('f32443eb3d8addef6e131592ad68fa95b18c33725c3a52385330b892')  
pro = ts.pro_api()  

# 获取比亚迪股票数据  
def get_stock_data(start_date='20200101', end_date='20230101'):  
    # 获取日线数据  
    df = pro.daily(ts_code='002594.SZ', start_date=start_date, end_date=end_date)  
    
    # 数据预处理:  
    # 1. 按照日期排序(Tushare返回的数据默认是从新到旧)  
    df = df.sort_values('trade_date')  
    
    # 2. 重命名列以匹配原代码  
    df = df.rename(columns={  
        'trade_date': 'date',  
        'open': 'Open',  
        'high': 'High',  
        'low': 'Low',  
        'close': 'Close',  
        'vol': 'Volume'  
    })  
    
    # 3. 设置日期为索引并转换为日期格式  
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')  
    df.set_index('date', inplace=True)  
    
    return df  

# 获取比亚迪股票数据  
stock_data = get_stock_data()  
In [ ]:
# 生成K线图并保存为图像  
def generate_chart_image(data, window_size=50, output_path='chart.png'):  
    plt.figure(figsize=(10, 6))  
    # 绘制K线图  
    for i in range(len(data)-window_size, len(data)):  
        date = data.index[i]  
        op, hi, lo, cl = data.loc[date, ['Open', 'High', 'Low', 'Close']]  
        
        # 绿色表示收盘价高于开盘价,红色表示开盘价高于收盘价  
        color = 'g' if cl >= op else 'r'  
        plt.plot([i, i], [lo, hi], color=color)  
        plt.plot([i, i-0.2], [op, op], color=color)  
        plt.plot([i, i+0.2], [cl, cl], color=color)  
    
    plt.title('股价图表')  
    plt.xticks([])  # 移除x轴标签以简化图像  
    plt.savefig(output_path)  
    plt.close()  
    
    # 读取并预处理图像  
    image = cv2.imread(output_path)  
    image = cv2.resize(image, (224, 224))  # 调整为模型输入尺寸  
    return image  

# 生成训练数据  
windows = []  
labels = []  # 假设有预定义的形态标签  

for i in range(len(stock_data) - 50):  
    window_data = stock_data.iloc[i:i+50]  
    image = generate_chart_image(window_data)  
    windows.append(image)  
    # 这里需要实际的标签数据,可以是手动标注或使用规则自动生成  
    # labels.append(...)  

    # 通过未来5个交易日的价格变动生成标签  
    if i + 55 < len(stock_data):  
        future_return = (stock_data.iloc[i+55]['Close'] / stock_data.iloc[i+50]['Close'] - 1) * 100  
        if future_return > 3:  
            pattern_label = 0  # 大幅上涨  
        elif future_return > 1:  
            pattern_label = 1  # 小幅上涨  
        elif future_return < -3:  
            pattern_label = 2  # 大幅下跌  
        elif future_return < -1:  
            pattern_label = 3  # 小幅下跌  
        else:  
            pattern_label = 4  # 盘整  
        
        labels.append(pattern_label)  
    else:  
        # 对于末尾没有足够未来数据的窗口,移除对应的X数据  
        windows.pop()  


X = np.array(windows)  
y = np.array(labels)  

# 可视化一个样本图表(可选)  
plt.figure(figsize=(8, 8))  
plt.imshow(cv2.cvtColor(X[0], cv2.COLOR_BGR2RGB))  
plt.title('比亚迪K线图样本')  
plt.axis('off')  
plt.show()
  1. CNN模型构建:
In [15]:
from tensorflow.keras.models import Sequential  
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout  
from tensorflow.keras.optimizers import Adam  
from sklearn.model_selection import train_test_split  

# 分割数据  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  

# 构建CNN模型  
model = Sequential([  
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),  
    MaxPooling2D((2, 2)),  
    Conv2D(64, (3, 3), activation='relu'),  
    MaxPooling2D((2, 2)),  
    Conv2D(128, (3, 3), activation='relu'),  
    MaxPooling2D((2, 2)),  
    Flatten(),  
    Dense(128, activation='relu'),  
    Dropout(0.5),  
    Dense(len(np.unique(y)), activation='softmax')  # 输出层,根据形态类别数量调整  
])  

model.compile(optimizer=Adam(learning_rate=0.001),  
             loss='sparse_categorical_crossentropy',  
             metrics=['accuracy'])  

# 训练模型  
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
Epoch 1/20
/home/kk/softwares/anaconda/lib/python3.11/site-packages/keras/src/layers/convolutional/base_conv.py:107: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
E0000 00:00:1747213794.425674 1756560 cuda_executor.cc:1228] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1747213794.427614 1756560 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
17/17 ━━━━━━━━━━━━━━━━━━━━ 8s 415ms/step - accuracy: 0.2649 - loss: 361.0623 - val_accuracy: 0.1407 - val_loss: 1.6813
Epoch 2/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 415ms/step - accuracy: 0.1765 - loss: 1.6520 - val_accuracy: 0.2000 - val_loss: 1.6014
Epoch 3/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 421ms/step - accuracy: 0.3033 - loss: 1.5905 - val_accuracy: 0.3852 - val_loss: 1.5929
Epoch 4/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 399ms/step - accuracy: 0.3502 - loss: 1.5898 - val_accuracy: 0.3704 - val_loss: 1.5989
Epoch 5/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 386ms/step - accuracy: 0.3577 - loss: 1.5987 - val_accuracy: 0.3704 - val_loss: 1.5942
Epoch 6/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 6s 382ms/step - accuracy: 0.3500 - loss: 1.5964 - val_accuracy: 0.3704 - val_loss: 1.5893
Epoch 7/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 388ms/step - accuracy: 0.3846 - loss: 1.5874 - val_accuracy: 0.3704 - val_loss: 1.5840
Epoch 8/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 385ms/step - accuracy: 0.3418 - loss: 1.5842 - val_accuracy: 0.3704 - val_loss: 1.5792
Epoch 9/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 387ms/step - accuracy: 0.3429 - loss: 1.5808 - val_accuracy: 0.3704 - val_loss: 1.5747
Epoch 10/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 392ms/step - accuracy: 0.3787 - loss: 1.5756 - val_accuracy: 0.3704 - val_loss: 1.5701
Epoch 11/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 390ms/step - accuracy: 0.3400 - loss: 1.5753 - val_accuracy: 0.3704 - val_loss: 1.5659
Epoch 12/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 409ms/step - accuracy: 0.3806 - loss: 1.5664 - val_accuracy: 0.3704 - val_loss: 1.5619
Epoch 13/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 389ms/step - accuracy: 0.3659 - loss: 1.5632 - val_accuracy: 0.3704 - val_loss: 1.5580
Epoch 14/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 386ms/step - accuracy: 0.3542 - loss: 1.5592 - val_accuracy: 0.3704 - val_loss: 1.5542
Epoch 15/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 387ms/step - accuracy: 0.3416 - loss: 1.5595 - val_accuracy: 0.3704 - val_loss: 1.5507
Epoch 16/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 384ms/step - accuracy: 0.3370 - loss: 1.5538 - val_accuracy: 0.3704 - val_loss: 1.5472
Epoch 17/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 387ms/step - accuracy: 0.3447 - loss: 1.5570 - val_accuracy: 0.3704 - val_loss: 1.5439
Epoch 18/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 388ms/step - accuracy: 0.3735 - loss: 1.5449 - val_accuracy: 0.3704 - val_loss: 1.5407
Epoch 19/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 389ms/step - accuracy: 0.3549 - loss: 1.5376 - val_accuracy: 0.3704 - val_loss: 1.5379
Epoch 20/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 389ms/step - accuracy: 0.3598 - loss: 1.5380 - val_accuracy: 0.3704 - val_loss: 1.5351
  1. 模型评估与可视化:
In [16]:
# 评估模型  
test_loss, test_acc = model.evaluate(X_test, y_test)  
print(f'Test accuracy: {test_acc}')  

# 可视化训练过程  
plt.figure(figsize=(12, 4))  
plt.subplot(1, 2, 1)  
plt.plot(history.history['accuracy'], 'b-', label='Training Accuracy')  
plt.plot(history.history['val_accuracy'], 'r-', label='Validation Accuracy')  
plt.legend()  
plt.title('Model Accuracy')  

plt.subplot(1, 2, 2)  
plt.plot(history.history['loss'], 'b-', label='Training Loss')  
plt.plot(history.history['val_loss'], 'r-', label='Validation Loss')  
plt.legend()  
plt.title('Model Loss')  

plt.tight_layout()  
plt.show()
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 74ms/step - accuracy: 0.4012 - loss: 1.5279
Test accuracy: 0.37037035822868347
  1. 形态识别与交易信号生成:
In [17]:
# 使用模型识别新图像  
def predict_pattern(image):  
    image = cv2.resize(image, (224, 224))  
    image = np.expand_dims(image, axis=0)  
    prediction = model.predict(image)  
    pattern_index = np.argmax(prediction[0])  
    confidence = prediction[0][pattern_index]  
    return pattern_index, confidence  

# 交易信号生成  
def generate_trading_signal(pattern_index, confidence):  
    # 根据识别的形态生成交易信号  
    # 这里只是示例逻辑,实际应用需要更复杂的规则  
    if pattern_index == 0 and confidence > 0.7:  # 假设0是看涨形态  
        return "BUY"  
    elif pattern_index == 1 and confidence > 0.7:  # 假设1是看跌形态  
        return "SELL"  
    else:  
        return "HOLD"