Goal: predict short-term market direction by analyzing the sentiment of financial news text
Data:
Analysis workflow:
# !pip install pandas numpy matplotlib scikit-learn gensim nltk transformers torch pyLDAvis wordcloud
# Uncomment the install command above as needed (wordcloud is required for the topic word clouds below)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models
import pyLDAvis.gensim_models
from transformers import BertTokenizer, BertForSequenceClassification
import torch
# Download the required NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# Load the data (news_data is expected to have 'date' and 'text' columns;
# market_data is expected to have 'date' and 'market_index' columns)
news_data = pd.read_csv('financial_news.csv')
market_data = pd.read_csv('market_index.csv')
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords, plus domain words that carry little signal in financial news
    stop_words = set(stopwords.words('english'))
    financial_stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
                            'company', 'stock', 'market', 'price', 'share'}
    stop_words = stop_words.union(financial_stop_words)
    filtered_tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    return filtered_tokens
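# Quick sanity check on an illustrative headline (the sentence is made up, not
# from the dataset), to confirm the stopword and domain-word filtering:
print(preprocess_text("Shares of the company rallied after earnings beat expectations."))
# Should print roughly: ['shares', 'rallied', 'earnings', 'beat', 'expectations']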
# Apply the preprocessing
news_data['tokens'] = news_data['text'].apply(preprocess_text)
# Build the dictionary
dictionary = corpora.Dictionary(news_data['tokens'])
# Filter out extremely rare and extremely common words
dictionary.filter_extremes(no_below=5, no_above=0.5)
# Build the bag-of-words corpus
corpus = [dictionary.doc2bow(doc) for doc in news_data['tokens']]
# Train the LDA model
num_topics = 10  # try different topic counts (see the coherence sketch below)
lda_model = models.LdaModel(
corpus=corpus,
id2word=dictionary,
num_topics=num_topics,
passes=10,
alpha='auto',
eta='auto'
)
# Inspect the discovered topics
for idx, topic in lda_model.print_topics(-1):
print(f'Topic {idx}: {topic}')
# Visualize the LDA results (renders inline in a notebook)
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis)
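# To pick num_topics more systematically, one option (a sketch, candidate
# counts are illustrative) is to compare c_v topic coherence across several
# topic counts with gensim's CoherenceModel:
from gensim.models import CoherenceModel
for k in [5, 10, 15, 20]:
    candidate = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, passes=10)
    cm = CoherenceModel(model=candidate, texts=news_data['tokens'],
                        dictionary=dictionary, coherence='c_v')
    print(f'num_topics={k}: c_v coherence = {cm.get_coherence():.4f}')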
# Assign each article its dominant topic
def get_dominant_topic(ldamodel, corpus):
    topic_probabilities = []
    for row in ldamodel[corpus]:
        # Sort the document's topics by probability, highest first
        row = sorted(row, key=lambda x: x[1], reverse=True)
        topic_num, topic_prob = row[0]
        topic_probabilities.append((topic_num, topic_prob))
    return topic_probabilities
topic_results = get_dominant_topic(lda_model, corpus)
news_data['dominant_topic'] = [result[0] for result in topic_results]
news_data['topic_probability'] = [result[1] for result in topic_results]
# Aggregate the topic distribution by date
daily_topics = news_data.groupby(['date', 'dominant_topic']).size().unstack(fill_value=0)
# Make sure every topic has a column, then normalize to daily topic shares
daily_topics = daily_topics.reindex(columns=range(num_topics), fill_value=0)
daily_topics = daily_topics.div(daily_topics.sum(axis=1), axis=0)
# Rename columns to topic_0 ... topic_9 (used as feature names later) and
# restore 'date' as a regular column so the merge below can use it
daily_topics.columns = [f'topic_{i}' for i in daily_topics.columns]
daily_topics = daily_topics.reset_index()
# Sentiment analysis with FinBERT
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert_model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')
def analyze_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = finbert_model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # finbert-tone's label order (per its model card) is neutral, positive, negative
    return probabilities.detach().numpy()[0]
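# Optional speed-up (a sketch): batch the FinBERT inference instead of calling
# .apply() row by row; torch.no_grad() skips gradient bookkeeping. The helper
# name and batch size are illustrative.
def analyze_sentiment_batch(texts, batch_size=32):
    all_scores = []
    for start in range(0, len(texts), batch_size):
        batch = list(texts[start:start + batch_size])
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            logits = finbert_model(**inputs).logits
        all_scores.extend(torch.nn.functional.softmax(logits, dim=-1).numpy())
    return all_scores
# e.g. news_sample['sentiment_scores'] = analyze_sentiment_batch(news_sample['text'].tolist())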
# Apply to a news sample (a subset keeps inference time manageable)
sample_size = min(500, len(news_data))  # adjust to the available compute
news_sample = news_data.sample(sample_size, random_state=42)
news_sample['sentiment_scores'] = news_sample['text'].apply(analyze_sentiment)
# Unpack the scores following the neutral/positive/negative label order
news_sample['neutral'] = news_sample['sentiment_scores'].apply(lambda x: x[0])
news_sample['positive'] = news_sample['sentiment_scores'].apply(lambda x: x[1])
news_sample['negative'] = news_sample['sentiment_scores'].apply(lambda x: x[2])
# Aggregate sentiment scores by date
daily_sentiment = news_sample.groupby('date').agg({
'positive': 'mean',
'negative': 'mean',
'neutral': 'mean'
}).reset_index()
# Merge sentiment, topic, and market data, keeping rows in date order
merged_data = pd.merge(daily_sentiment, daily_topics, on='date')
merged_data = pd.merge(merged_data, market_data, on='date')
merged_data = merged_data.sort_values('date')
# Prediction target: next-day return (pct_change assumes rows are in date order)
merged_data['next_day_return'] = merged_data['market_index'].pct_change().shift(-1)
# Drop rows with missing values (the last row has no next-day return)
merged_data = merged_data.dropna()
# Feature engineering: sentiment and topic features
X = merged_data[['positive', 'negative', 'neutral'] +
                [f'topic_{i}' for i in range(num_topics)]]
y = (merged_data['next_day_return'] > 0).astype(int)  # binary target: up vs. down
# Split the data (note: a shuffled split leaks future information on a time
# series; see the time-aware evaluation sketch below)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a random forest classifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Evaluate the model
from sklearn.metrics import classification_report, confusion_matrix
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))
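# Because the rows are ordered in time, the shuffled split above is optimistic.
# A time-aware check (a sketch; the number of splits is illustrative) with
# sklearn's TimeSeriesSplit, assuming merged_data is sorted by date:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=42),
                            X, y, cv=tscv, scoring='accuracy')
print('Walk-forward accuracy by fold:', cv_scores.round(3))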
# Feature importance analysis
feature_imp = pd.DataFrame(sorted(zip(model.feature_importances_, X.columns)),
                           columns=['Value', 'Feature'])
plt.figure(figsize=(10, 6))
plt.barh(feature_imp['Feature'], feature_imp['Value'])
plt.title('Feature Importance for Market Direction Prediction')
plt.tight_layout()
plt.show()
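# Impurity-based importances can be biased toward certain features; permutation
# importance on the held-out set (a sketch) is a useful cross-check.
from sklearn.inspection import permutation_importance
perm = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
for name, score in sorted(zip(X.columns, perm.importances_mean), key=lambda t: -t[1]):
    print(f'{name}: {score:.4f}')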
# Visualize topic trends alongside the market index
plt.figure(figsize=(12, 8))
plt.subplot(2, 1, 1)
# Plot the trends of the most important topics
top_topics = 3  # number of top topics to show
top_topic_indices = feature_imp[feature_imp['Feature'].str.contains('topic')].nlargest(top_topics, 'Value')['Feature']
for topic_col in top_topic_indices:
    topic_idx = int(topic_col.split('_')[1])
    plt.plot(merged_data['date'], merged_data[topic_col], label=f'Topic {topic_idx}')
plt.title('Topic Trends Over Time')
plt.legend()
plt.subplot(2, 1, 2)
plt.plot(merged_data['date'], merged_data['market_index'], 'k-')
plt.title('Market Index')
plt.tight_layout()
plt.show()
# Topic word clouds
from wordcloud import WordCloud
def plot_topic_wordcloud(lda_model, topic_id):
    topic_words = dict(lda_model.show_topic(topic_id, 30))
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white').generate_from_frequencies(topic_words)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Topic {topic_id} Word Cloud')
    plt.show()
# Generate word clouds for the important topics
for topic_col in top_topic_indices:
    topic_idx = int(topic_col.split('_')[1])
    plot_topic_wordcloud(lda_model, topic_idx)
Goal: use a convolutional neural network to recognize technical patterns in stock candlestick (K-line) charts and generate trading signals
Data:
Analysis workflow:
# %pip install tushare
# %pip install opencv-python
# %pip install tensorflow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tushare as ts
from PIL import Image
import cv2
# Set your Tushare API token (keep real tokens out of shared code)
ts.set_token('YOUR_TUSHARE_TOKEN')
pro = ts.pro_api()
# Fetch BYD (002594.SZ) daily bars via the Tushare Pro API
def get_stock_data(start_date='20200101', end_date='20230101'):
    # Fetch daily data
    df = pro.daily(ts_code='002594.SZ', start_date=start_date, end_date=end_date)
    # Preprocessing:
    # 1. Sort by date (Tushare returns rows newest-first by default)
    df = df.sort_values('trade_date')
    # 2. Rename columns to the OHLCV names used below
    df = df.rename(columns={
        'trade_date': 'date',
        'open': 'Open',
        'high': 'High',
        'low': 'Low',
        'close': 'Close',
        'vol': 'Volume'
    })
    # 3. Parse dates and set them as the index
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df.set_index('date', inplace=True)
    return df
# Fetch the BYD stock data
stock_data = get_stock_data()
# Generate a candlestick chart and save it as an image
def generate_chart_image(data, window_size=50, output_path='chart.png'):
    plt.figure(figsize=(10, 6))
    # Draw OHLC bars
    for i in range(len(data) - window_size, len(data)):
        date = data.index[i]
        op, hi, lo, cl = data.loc[date, ['Open', 'High', 'Low', 'Close']]
        # Green when the close is at or above the open, red otherwise
        color = 'g' if cl >= op else 'r'
        plt.plot([i, i], [lo, hi], color=color)        # high-low range
        plt.plot([i, i - 0.2], [op, op], color=color)  # open tick (left)
        plt.plot([i, i + 0.2], [cl, cl], color=color)  # close tick (right)
    plt.title('Price Chart')
    plt.xticks([])  # drop x-axis labels to simplify the image
    plt.savefig(output_path)
    plt.close()
    # Read the image back and resize it to the model's input size
    image = cv2.imread(output_path)
    image = cv2.resize(image, (224, 224))
    return image
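# An alternative renderer (a sketch): the mplfinance package draws standard
# candlesticks in a single call, which is less error-prone than the manual
# plotting above. It is not in the install list here, hence commented out.
# import mplfinance as mpf  # pip install mplfinance
# mpf.plot(stock_data.tail(50), type='candle', volume=True,
#          savefig=dict(fname='chart_mpf.png', dpi=100))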
# Build the training data: one chart image per 50-day window
windows = []
labels = []
for i in range(len(stock_data) - 50):
    window_data = stock_data.iloc[i:i+50]
    image = generate_chart_image(window_data)
    windows.append(image)
    # Labels could come from manual annotation or rules; here they are
    # generated from the price move over the 5 trading days after the window
    if i + 55 < len(stock_data):
        future_return = (stock_data.iloc[i+55]['Close'] / stock_data.iloc[i+50]['Close'] - 1) * 100
        if future_return > 3:
            pattern_label = 0  # strong rise
        elif future_return > 1:
            pattern_label = 1  # mild rise
        elif future_return < -3:
            pattern_label = 2  # strong fall
        elif future_return < -1:
            pattern_label = 3  # mild fall
        else:
            pattern_label = 4  # sideways
        labels.append(pattern_label)
    else:
        # Windows at the end lack enough future data; drop the matching image
        windows.pop()
X = np.array(windows)
y = np.array(labels)
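# The images come back from OpenCV as uint8 in [0, 255]; scaling to [0, 1] is a
# common preprocessing step that usually helps a CNN converge (a very large
# first-epoch loss is a typical symptom of unscaled inputs). Checking the class
# balance is also worthwhile, since "sideways" windows often dominate.
X = X.astype('float32') / 255.0
print('Class distribution:', np.bincount(y))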
# Visualize a sample chart (optional)
plt.figure(figsize=(8, 8))
plt.imshow(cv2.cvtColor(X[0], cv2.COLOR_BGR2RGB))
plt.title('Sample BYD Candlestick Chart')
plt.axis('off')
plt.show()
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
# Split the data (note: overlapping windows in a shuffled split share bars
# between train and test, so these scores are optimistic; a chronological
# split would be stricter)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build the CNN model (an explicit Input layer is the form Keras now prefers)
model = Sequential([
    Input(shape=(224, 224, 3)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(np.unique(y)), activation='softmax')  # one unit per pattern class
])
model.compile(optimizer=Adam(learning_rate=0.001),
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
Epoch 1/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 8s 415ms/step - accuracy: 0.2649 - loss: 361.0623 - val_accuracy: 0.1407 - val_loss: 1.6813
Epoch 2/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 415ms/step - accuracy: 0.1765 - loss: 1.6520 - val_accuracy: 0.2000 - val_loss: 1.6014
Epoch 3/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 421ms/step - accuracy: 0.3033 - loss: 1.5905 - val_accuracy: 0.3852 - val_loss: 1.5929
Epoch 4/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 399ms/step - accuracy: 0.3502 - loss: 1.5898 - val_accuracy: 0.3704 - val_loss: 1.5989
Epoch 5/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 386ms/step - accuracy: 0.3577 - loss: 1.5987 - val_accuracy: 0.3704 - val_loss: 1.5942
Epoch 6/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 6s 382ms/step - accuracy: 0.3500 - loss: 1.5964 - val_accuracy: 0.3704 - val_loss: 1.5893
Epoch 7/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 388ms/step - accuracy: 0.3846 - loss: 1.5874 - val_accuracy: 0.3704 - val_loss: 1.5840
Epoch 8/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 385ms/step - accuracy: 0.3418 - loss: 1.5842 - val_accuracy: 0.3704 - val_loss: 1.5792
Epoch 9/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 387ms/step - accuracy: 0.3429 - loss: 1.5808 - val_accuracy: 0.3704 - val_loss: 1.5747
Epoch 10/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 392ms/step - accuracy: 0.3787 - loss: 1.5756 - val_accuracy: 0.3704 - val_loss: 1.5701
Epoch 11/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 390ms/step - accuracy: 0.3400 - loss: 1.5753 - val_accuracy: 0.3704 - val_loss: 1.5659
Epoch 12/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 409ms/step - accuracy: 0.3806 - loss: 1.5664 - val_accuracy: 0.3704 - val_loss: 1.5619
Epoch 13/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 389ms/step - accuracy: 0.3659 - loss: 1.5632 - val_accuracy: 0.3704 - val_loss: 1.5580
Epoch 14/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 386ms/step - accuracy: 0.3542 - loss: 1.5592 - val_accuracy: 0.3704 - val_loss: 1.5542
Epoch 15/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 387ms/step - accuracy: 0.3416 - loss: 1.5595 - val_accuracy: 0.3704 - val_loss: 1.5507
Epoch 16/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 384ms/step - accuracy: 0.3370 - loss: 1.5538 - val_accuracy: 0.3704 - val_loss: 1.5472
Epoch 17/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 387ms/step - accuracy: 0.3447 - loss: 1.5570 - val_accuracy: 0.3704 - val_loss: 1.5439
Epoch 18/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 388ms/step - accuracy: 0.3735 - loss: 1.5449 - val_accuracy: 0.3704 - val_loss: 1.5407
Epoch 19/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 389ms/step - accuracy: 0.3549 - loss: 1.5376 - val_accuracy: 0.3704 - val_loss: 1.5379
Epoch 20/20
17/17 ━━━━━━━━━━━━━━━━━━━━ 7s 389ms/step - accuracy: 0.3598 - loss: 1.5380 - val_accuracy: 0.3704 - val_loss: 1.5351
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')
# Visualize the training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], 'b-', label='Training Accuracy')
plt.plot(history.history['val_accuracy'], 'r-', label='Validation Accuracy')
plt.legend()
plt.title('Model Accuracy')
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], 'b-', label='Training Loss')
plt.plot(history.history['val_loss'], 'r-', label='Validation Loss')
plt.legend()
plt.title('Model Loss')
plt.tight_layout()
plt.show()
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 74ms/step - accuracy: 0.4012 - loss: 1.5279
Test accuracy: 0.37037035822868347
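# The validation accuracy above is flat while the loss drifts down slowly; a
# retraining sketch with standard Keras callbacks (the hyperparameters here
# are illustrative, not tuned):
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3),
]
# history = model.fit(X_train, y_train, epochs=50, batch_size=32,
#                     validation_data=(X_test, y_test), callbacks=callbacks)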
# Use the model to classify a new chart image
def predict_pattern(image):
    image = cv2.resize(image, (224, 224))
    image = image.astype('float32') / 255.0  # match the training-time scaling
    image = np.expand_dims(image, axis=0)
    prediction = model.predict(image)
    pattern_index = np.argmax(prediction[0])
    confidence = prediction[0][pattern_index]
    return pattern_index, confidence
# Generate a trading signal from the recognized pattern
def generate_trading_signal(pattern_index, confidence):
    # Illustrative logic only; a real system needs far more robust rules.
    # Under the labeling scheme above, 0 = strong rise and 2 = strong fall.
    if pattern_index == 0 and confidence > 0.7:  # bullish pattern
        return "BUY"
    elif pattern_index == 2 and confidence > 0.7:  # bearish pattern
        return "SELL"
    else:
        return "HOLD"
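# End-to-end usage (a sketch): score the most recent 50-day window and turn
# the prediction into a signal. The file name 'latest_chart.png' is arbitrary.
latest_image = generate_chart_image(stock_data.tail(50), window_size=50,
                                    output_path='latest_chart.png')
pattern_idx, conf = predict_pattern(latest_image)
print(f'Pattern {pattern_idx} (confidence {conf:.2f}) -> {generate_trading_signal(pattern_idx, conf)}')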