StockPrediction/logistic_regression_only_re...

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import yfinance as yf
from datetime import datetime
import os, sys

from sklearn import preprocessing

#bodacious colors
colors=sns.color_palette("rocket", 8)
#Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
#            0sangre,   1neptune,  2pumpkin,  3clover,   4denim,    5cocoa,    6cumin,    7berry

train_quota = 0.8

def enlarge_lag(to_enlarge, time_window=1):
    # to_enlarge is the data already present, should be a numpy array
    enlarged = []
    for i in range(to_enlarge.shape[0] - time_window + 1):
        new_element = []
        for j in range(time_window):
            new_element.extend(to_enlarge[i + time_window - 1 - j, :])
        enlarged.append(new_element)

    return np.array(enlarged)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def logreg_inference(x, w, b):
    z = (x @ w) + b
    p = sigmoid(z)
    return p


def cross_entropy(P, Y):
    return (-Y * np.log(P) - (1 - Y) * np.log(1 - P)).mean()


def logreg_train(X, Y, lambda_, lr = 1e-4, steps=100000):
    # The training samples are defined as such (each row of X is a sample):
    # X[0, :] -> Y[0]
    # X[1, :] -> Y[1]

    m, n = X.shape

    # Initial values for the parameters
    w = np.zeros(n)
    b = 0

    # Initial values for the "precedent loss" and "convergence" variables, used to check convergence
    prec_loss = 0
    convergence = 0

    for step in range(steps):
        P = logreg_inference(X, w, b)
        loss = cross_entropy(P, Y)


        if step % 1000 == 0:
            print(step, loss)

            # Difference between "precedent loss" and "current loss"
            diff = np.absolute(prec_loss - loss)
            prec_loss = loss
            if diff < 0.00001:
                # If convergence is reached, the algorithm is stopped
                convergence = step
                break

        # Derivative of the loss function with respect to bias
        grad_b = (P - Y).mean()

        # Gradient of the loss function with respect to weights
        grad_w = (X.T @ (P - Y)) / m

        w -= lr * grad_w
        b -= lr * grad_b

        # Every 100 iteration the values of accuracy and loss are saved for plotting
        if step%100 == 0:
            Yhat = (P > 0.5)
            acc_array.append((Y == Yhat).mean() * 100)
            losses.append(loss)

    # Print the iterations needed for convergence before returning
    print("Convergence = ", convergence)

    return w, b


if len(sys.argv) > 1:
    time_window = int(sys.argv[1])
else:
    time_window = 1

#time_window = 10

stock_data = pd.read_pickle("data/MSFT_data.pkl")

daily_returns = ((stock_data["Close"] - stock_data["Open"]) / stock_data["Open"]).to_numpy().reshape(-1,1)


# merge data into 2d numpy array
Y = np.zeros(daily_returns.shape[0] - 1)

print(daily_returns.shape, Y.shape)

for i in range(Y.size):
    if daily_returns[i+1] >= 0:
        Y[i] = 1
    else:
        Y[i] = 0
import copy
norm_features = copy.deepcopy(daily_returns)
if time_window > 1:
    norm_features = enlarge_lag(norm_features, time_window)
    Y = Y[time_window-1:]

train_size = int(norm_features.shape[0] * 0.8)
X_train = norm_features[:train_size, ]
Y_train = Y[:train_size]

X_test = norm_features[train_size:-1, ]
Y_test = Y[train_size:]


# Lists to save accuracy and loss
acc_array = []
losses = []

w, b = logreg_train(X_train, Y_train, 0.0, 1e-3, 1000000)
print("Weights: ", w)
print("Bias: ", b)

# Iterations vs Accuracy plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, acc_array)
#plt.xlabel("Iterations")
#plt.ylabel("Accuracy")
#
## Iterations vs Loss plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, losses)
#plt.xlabel("Iterations")
#plt.ylabel("Losses")
#
#plt.show()
# Training accuracy of the model, is the last value recorded in the array
print("Training Acc: ", acc_array[-1])

P_test = logreg_inference(X_test, w, b)
Yhat_test = (P_test > 0.5)
accuracy_test = (Y_test == Yhat_test).mean()
print("Test accuracy: ", 100*accuracy_test)


#lets try sklearn
#from sklearn.linear_model import LogisticRegression
#classifier = LogisticRegression(random_state=0, solver="saga").fit(X_train, Y_train)
#score = classifier.score(X_test, Y_test)
#print("sklearn score, all default: ", score)

with open("plots/data/logistic_regression_only_rets.csv", "a") as f:
    f.write(f"{time_window};{acc_array[-1]};{accuracy_test};\n")