import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from pandas.plotting import register_matplotlib_converters
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Let matplotlib handle pandas datetime indexes on the x-axis.
register_matplotlib_converters()
# Global plot styling: large, readable figures for the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 22, 10
# Fix the NumPy and TensorFlow RNGs so runs are reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
# S&P 500 daily close prices, indexed by trading date.
# Assumes spx.csv has columns 'date' and 'close' — TODO confirm file schema.
df = pd.read_csv('spx.csv', parse_dates=['date'], index_col='date')
df.head()
close | |
date | |
1986-01-02 | 209.59 |
1986-01-03 | 210.88 |
1986-01-06 | 210.65 |
1986-01-07 | 213.80 |
1986-01-08 | 207.97 |
# Visualise the raw close-price series before any preprocessing.
plt.plot(df, label='close price')
plt.legend()

# Chronological 95/5 split — never shuffle a time series.
train_size = int(len(df) * 0.95)
test_size = len(df) - train_size
# .copy() because later cells assign into these frames
# (train['close'] = ...). Without the copy they are views of df, and the
# assignment raises SettingWithCopyWarning (and under pandas copy-on-write
# would silently not stick).
train = df.iloc[:train_size].copy()
test = df.iloc[train_size:].copy()
print(train.shape, test.shape)
(7782, 1) (410, 1)
from sklearn.preprocessing import StandardScaler

# Standardise prices with statistics from the TRAINING split only, so no
# information from the test period leaks into the model. The fitted
# scaler is reused later to inverse-transform predictions for plotting.
scaler = StandardScaler()
scaler = scaler.fit(train[['close']])

# transform() returns an (n, 1) array; flatten it to 1-D and assign with
# .loc to avoid chained-assignment and column-shape pitfalls.
train.loc[:, 'close'] = scaler.transform(train[['close']]).flatten()
test.loc[:, 'close'] = scaler.transform(test[['close']]).flatten()
def create_dataset(X, y, time_steps=1):
    """Slice a feature frame and a target series into overlapping windows.

    Args:
        X: DataFrame of features; each window is ``time_steps`` consecutive rows.
        y: Series of targets; the target for a window is the value that
           immediately follows it.
        time_steps: window length.

    Returns:
        Tuple of (windows, targets) as NumPy arrays with shapes
        (n - time_steps, time_steps, n_features) and (n - time_steps,).
    """
    n_windows = len(X) - time_steps
    windows = [X.iloc[start:start + time_steps].values for start in range(n_windows)]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)
# Window length: each sample is 30 consecutive trading days.
TIME_STEPS = 30
# reshape to [samples, time_steps, n_features] — the 3-D layout LSTMs expect
X_train, y_train = create_dataset(train[['close']], train.close, TIME_STEPS)
X_test, y_test = create_dataset(test[['close']], test.close, TIME_STEPS)
print(X_train.shape)
(7752, 30, 1)
# LSTM autoencoder: encode each 30-step window into a single 64-d vector,
# then decode it back to the original (time_steps, n_features) shape.
# Anomalies are detected later from the reconstruction error.
model = keras.Sequential()
model.add(keras.layers.LSTM(
    units=64,
    input_shape=(X_train.shape[1], X_train.shape[2])
))
model.add(keras.layers.Dropout(rate=0.2))
# Repeat the encoding once per time step so the decoder LSTM can unroll it.
model.add(keras.layers.RepeatVector(n=X_train.shape[1]))
model.add(keras.layers.LSTM(units=64, return_sequences=True))
model.add(keras.layers.Dropout(rate=0.2))
# One Dense per time step maps the decoder state back to n_features values.
model.add(keras.layers.TimeDistributed(keras.layers.Dense(units=X_train.shape[2])))
model.compile(loss='mae', optimizer='adam')

# An autoencoder is trained to RECONSTRUCT its input, so the target is
# X_train itself. (The original passed y_train — next-day values of shape
# (n,) — which was silently broadcast against the (n, 30, 1) output and
# trained the model on the wrong objective; the evaluation cells below
# measure |X_pred - X|, i.e. reconstruction error.)
history = model.fit(
    X_train, X_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
    shuffle=False
)
Train on 6976 samples, validate on 776 samples
Epoch 1/10
6976/6976 [==============================] - 7s 1ms/sample - loss: 0.1672 - val_loss: 0.1806
Epoch 2/10
6976/6976 [==============================] - 2s 277us/sample - loss: 0.0916 - val_loss: 0.2029
Epoch 3/10
6976/6976 [==============================] - 2s 278us/sample - loss: 0.0954 - val_loss: 0.1362
Epoch 4/10
6976/6976 [==============================] - 2s 279us/sample - loss: 0.0840 - val_loss: 0.1941
Epoch 5/10
6976/6976 [==============================] - 2s 261us/sample - loss: 0.0832 - val_loss: 0.1903
Epoch 6/10
6976/6976 [==============================] - 2s 268us/sample - loss: 0.0851 - val_loss: 0.2351
Epoch 7/10
6976/6976 [==============================] - 2s 257us/sample - loss: 0.0834 - val_loss: 0.1704
Epoch 8/10
6976/6976 [==============================] - 2s 261us/sample - loss: 0.0840 - val_loss: 0.1327
Epoch 9/10
6976/6976 [==============================] - 2s 261us/sample - loss: 0.0778 - val_loss: 0.1370
Epoch 10/10
6976/6976 [==============================] - 2s 257us/sample - loss: 0.1041 - val_loss: 0.1241
# Training curves: watch for divergence between train and validation loss.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()

X_train_pred = model.predict(X_train)
# Per-window reconstruction error: MAE over the 30 time steps.
train_mae_loss = np.mean(np.abs(X_train_pred - X_train), axis=1)
# Inspect the error distribution to choose an anomaly threshold.
# sns.distplot was deprecated in seaborn 0.11 and later removed;
# histplot(..., kde=True) is the modern equivalent.
sns.histplot(train_mae_loss, bins=50, kde=True)
X_test_pred = model.predict(X_test)
# Reduce the (n, 30, 1) error tensor over BOTH trailing axes so each
# window gets one scalar score. (Reducing over axis=1 alone leaves an
# (n, 1) array, which relies on pandas squeezing the extra axis when
# assigned to a column.)
test_mae_loss = np.mean(np.abs(X_test_pred - X_test), axis=(1, 2))

# Fixed threshold picked by eye from the training-error histogram above.
THRESHOLD = 0.65

# One row per test window; windows are aligned so that a window's score
# sits at the date it predicts (the first TIME_STEPS dates have no score).
test_score_df = pd.DataFrame(index=test[TIME_STEPS:].index)
test_score_df['loss'] = test_mae_loss
test_score_df['threshold'] = THRESHOLD
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold
test_score_df['close'] = test[TIME_STEPS:].close

plt.plot(test_score_df.index, test_score_df.loss, label='loss')
plt.plot(test_score_df.index, test_score_df.threshold, label='threshold')
plt.xticks(rotation=25)
plt.legend()

# Boolean column indexes directly — no need for "== True".
anomalies = test_score_df[test_score_df.anomaly]
anomalies.head()
loss | threshold | anomaly | close | |
date | ||||
2018-02-06 | 0.723123 | 0.65 | True | 3.193456 |
2018-02-07 | 0.744779 | 0.65 | True | 3.168136 |
2018-02-08 | 0.754374 | 0.65 | True | 2.979068 |
2018-02-09 | 0.802155 | 0.65 | True | 3.051476 |
2018-02-12 | 0.811910 | 0.65 | True | 3.119939 |
# Plot the close price back in original units, with detected anomalies
# overlaid. inverse_transform requires 2-D input in modern scikit-learn,
# so pass single-column DataFrames ([['close']]) rather than 1-D Series,
# and flatten the (n, 1) result back to 1-D for plotting.
plt.plot(
    test[TIME_STEPS:].index,
    scaler.inverse_transform(test[TIME_STEPS:][['close']]).flatten(),
    label='close price'
)
# seaborn >= 0.13 no longer accepts positional data arguments for
# scatterplot — pass x= and y= explicitly.
sns.scatterplot(
    x=anomalies.index,
    y=scaler.inverse_transform(anomalies[['close']]).flatten(),
    color=sns.color_palette()[3],
    s=52,
    label='anomaly'
)
plt.xticks(rotation=25)
plt.legend()
知乎学术咨询:https://www.zhihu.com/consult/people/792359672131756032?isMe=1
担任《Mechanical System and Signal Processing》等审稿专家,擅长领域:信号滤波/降噪,机器学习/深度学习,时间序列预分析/预测,设备故障诊断/缺陷检测/异常检测。
分割线
基于Maximin的异常检测方法(MATLAB)
完整代码:
https://mbd.pub/o/bread/mbd-Y56clpxx