StockPrediction/logistic_regression.py

228 lines
6.5 KiB
Python

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yfinance as yf
from datetime import datetime
import os, sys
from sklearn import preprocessing
#bodacious colors
colors=sns.color_palette("rocket", 8)
#Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry
train_quota = 0.8
def enlarge_lag(to_enlarge, time_window=1):
# to_enlarge is the data already present, should be a numpy array
enlarged = []
for i in range(to_enlarge.shape[0] - time_window + 1):
new_element = []
for j in range(time_window):
new_element.extend(to_enlarge[i + time_window - 1 - j, :])
enlarged.append(new_element)
return np.array(enlarged)
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def logreg_inference(x, w, b):
z = (x @ w) + b
p = sigmoid(z)
return p
def cross_entropy(P, Y):
return (-Y * np.log(P) - (1 - Y) * np.log(1 - P)).mean()
def logreg_train(X, Y, lambda_, lr = 1e-4, steps=100000):
# The training samples are defined as such (each row of X is a sample):
# X[0, :] -> Y[0]
# X[1, :] -> Y[1]
m, n = X.shape
# Initial values for the parameters
w = np.zeros(n)
b = 0
# Initial values for the "precedent loss" and "convergence" variables, used to check convergence
prec_loss = 0
convergence = 0
for step in range(steps):
P = logreg_inference(X, w, b)
loss = cross_entropy(P, Y)
if step % 1000 == 0:
print(step, loss)
# Difference between "precedent loss" and "current loss"
diff = np.absolute(prec_loss - loss)
prec_loss = loss
if diff < 0.00001:
# If convergence is reached, the algorithm is stopped
convergence = step
break
# Derivative of the loss function with respect to bias
grad_b = (P - Y).mean()
# Gradient of the loss function with respect to weights
grad_w = (X.T @ (P - Y)) / m
w -= lr * grad_w
b -= lr * grad_b
# Every 100 iteration the values of accuracy and loss are saved for plotting
if step%100 == 0:
Yhat = (P > 0.5)
acc_array.append((Y == Yhat).mean() * 100)
losses.append(loss)
# Print the iterations needed for convergence before returning
print("Convergence = ", convergence)
return w, b
if len(sys.argv) > 1:
time_window = int(sys.argv[1])
else:
time_window = 1
#time_window = 10
stock_data = pd.read_pickle("data/MSFT_data.pkl")
daily_returns = ((stock_data["Close"] - stock_data["Open"]) / stock_data["Open"]).to_numpy()
prices = stock_data[["Open", "High", "Low", "Close"]].to_numpy()
volume = stock_data["Volume"].to_numpy()
minmax_scaler = preprocessing.MinMaxScaler()
std_scaler = preprocessing.StandardScaler()
features = np.vstack((daily_returns, volume)).T
# Scale volume data to obtain better results
#minmax_scaler = preprocessing.MinMaxScaler()
#norm_ret = std_scaler.fit_transform(daily_returns.reshape(-1,1)).flatten()
#norm_vol = minmax_scaler.fit_transform(volume.reshape(-1,1)).flatten()
#norm_features = np.vstack((norm_ret, norm_vol)).T
# Solo volumi e ritorni
#norm_features = std_scaler.fit_transform(features)
# Aggiunta di prezzi
#norm_prices = minmax_scaler.fit_transform(prices.reshape(-1, 1)).reshape(-1, 4)
#norm_ret_and_vol = std_scaler.fit_transform(features)
#norm_features = np.hstack((norm_ret_and_vol, norm_prices))
# Necessary for MAs
part_features = std_scaler.fit_transform(features)
# Aggiunta SMA
#SMA_20 = stock_data["Close"].rolling(20).mean().to_numpy()
#SMA_50 = stock_data["Close"].rolling(50).mean().to_numpy()
#SMA_200 = stock_data["Close"].rolling(200).mean().to_numpy()
#SMAs = np.vstack((SMA_20, SMA_50)).T
#norm_SMAs = minmax_scaler.fit_transform(SMAs[49:, ].reshape(-1, 1)).reshape(-1, 2)
#norm_features = np.hstack((part_features[49:, ], norm_SMAs))
#SMAs = np.vstack((SMA_20, SMA_50, SMA_200)).T
#norm_SMAs = minmax_scaler.fit_transform(SMAs[199:, ].reshape(-1, 1)).reshape(-1, 3)
#norm_features = np.hstack((part_features[199:, ], norm_SMAs))
# Aggiunta EMA
EMA_20 = stock_data["Close"].ewm(span=20, adjust=False).mean()
EMA_50 = stock_data["Close"].ewm(span=50, adjust=False).mean()
EMAs = np.vstack((EMA_20, EMA_50)).T
norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 2)
#EMA_200 = stock_data["Close"].ewm(span=200, adjust=False).mean()
#EMAs = np.vstack((EMA_20, EMA_50, EMA_200)).T
#norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 3)
norm_features = np.hstack((part_features, norm_EMAs))
# merge data into 2d numpy array
Y = np.zeros(features.shape[0] - 1)
for i in range(Y.size):
if daily_returns[i+1] >= 0:
Y[i] = 1
else:
Y[i] = 0
# per quando su usano ma fino a 200
#Y = Y[49:]
#Y = Y[199:]
print(norm_features.shape, Y.shape)
if time_window > 1:
norm_features = enlarge_lag(norm_features, time_window)
Y = Y[time_window-1:]
train_size = int(norm_features.shape[0] * 0.8)
X_train = norm_features[:train_size, ]
Y_train = Y[:train_size]
X_test = norm_features[train_size:-1, ]
Y_test = Y[train_size:]
#if time_window > 1:
# X_train = enlarge_lag(X_train)
# Y_train = Y_train[time_window-1:]
#
# X_test = enlarge_lag(X_test)
# Y_test = Y_test[time_window-1:]
# Lists to save accuracy and loss
acc_array = []
losses = []
w, b = logreg_train(X_train, Y_train, 0.0, 1e-3, 1000000)
print("Weights: ", w)
print("Bias: ", b)
# Iterations vs Accuracy plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, acc_array)
#plt.xlabel("Iterations")
#plt.ylabel("Accuracy")
#
## Iterations vs Loss plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, losses)
#plt.xlabel("Iterations")
#plt.ylabel("Losses")
#
#plt.show()
# Training accuracy of the model, is the last value recorded in the array
print("Training Acc: ", acc_array[-1])
P_test = logreg_inference(X_test, w, b)
Yhat_test = (P_test > 0.5)
accuracy_test = (Y_test == Yhat_test).mean()
print("Test accuracy: ", 100*accuracy_test)
#lets try sklearn
#from sklearn.linear_model import LogisticRegression
#classifier = LogisticRegression(random_state=0, solver="saga").fit(X_train, Y_train)
#score = classifier.score(X_test, Y_test)
#print("sklearn score, all default: ", score)
with open("plots/data/logistic_regression_EMA_20_50.csv", "a") as f:
f.write(f"{time_window};{acc_array[-1]};{accuracy_test};\n")