dev #1
@ -0,0 +1,222 @@
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import yfinance as yf
from datetime import datetime
import os, sys

from sklearn import preprocessing

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# bodacious colors
colors = sns.color_palette("rocket", 8)
# Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry


def enlarge_lag(to_enlarge, time_window=1):
    # to_enlarge is the data already present, should be a numpy array
    enlarged = []
    for i in range(to_enlarge.shape[0] - time_window + 1):
        new_element = []
        for j in range(time_window):
            new_element.extend(to_enlarge[i + time_window - 1 - j, :])
        enlarged.append(new_element)

    return np.array(enlarged)
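
# Illustrative example (not in the original script): with
# x = np.arange(4).reshape(-1, 1) and time_window=2,
# enlarge_lag(x, 2) -> [[1, 0], [2, 1], [3, 2]]
# i.e. each row holds the current value first, then the lagged ones.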

#### Calculate the metrics RMSE and MAPE ####
def calculate_rmse(y_true, y_pred):
    """
    Calculate the Root Mean Squared Error (RMSE)
    """
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return rmse


def calculate_mape(y_true, y_pred):
    """
    Calculate the Mean Absolute Percentage Error (MAPE) %
    """
    y_pred, y_true = np.array(y_pred), np.array(y_true)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mape
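
# Quick numeric check (illustrative): with y_true = [100., 200.] and
# y_pred = [110., 180.], calculate_rmse gives sqrt((10**2 + 20**2) / 2) ~= 15.81
# and calculate_mape gives (0.10 + 0.10) / 2 * 100 = 10.0 (%).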

train_quota = 0.8

if len(sys.argv) > 1:
    time_window = int(sys.argv[1])
else:
    time_window = 1

#time_window = 10

stock_data = pd.read_pickle("data/MSFT_data.pkl")

price = stock_data["Close"].to_numpy()
volume = stock_data["Volume"].to_numpy()
daily_returns = ((stock_data["Close"] - stock_data["Open"]) / stock_data["Open"]).to_numpy()

minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

features = np.vstack((price, volume)).T

# Necessary for MAs
norm_features = minmax_scaler.fit_transform(price.reshape(-1, 1))


# merge data into 2d numpy array
Y = np.zeros(features.shape[0] - 1)

# Each target is the next day's normalized closing price
for i in range(Y.size):
    Y[i] = norm_features[i+1, 0]

time_window = 20  # NOTE: hard-coded here, so it overrides the command-line value parsed above

if time_window > 1:
    norm_features = enlarge_lag(norm_features, time_window)
    Y = Y[time_window-1:]

print(norm_features.shape, Y.shape)

train_size = int(norm_features.shape[0] * 0.8)
X_train = norm_features[:train_size, ]
Y_train = Y[:train_size]

# After lagging, norm_features has one more row than Y, so the last feature
# row (which has no next-day target) is dropped from the test set.
X_test = norm_features[train_size:-1, ]
Y_test = Y[train_size:]

def LSTM_model():
    model = Sequential()

    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
    model.add(Dropout(0.2))

    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(units=50))
    model.add(Dropout(0.2))

    model.add(Dense(units=1))

    return model

model = LSTM_model()
model.summary()
model.compile(
    optimizer="adam",
    loss="mean_squared_error"
)

# Save weights only for best model
checkpointer = ModelCheckpoint(
    filepath='weights_best.hdf5',
    verbose=2,
    save_best_only=True
)
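
# EarlyStopping is imported above but never used; a minimal sketch of how it
# could be wired in (an editorial suggestion, not part of the original run):
# early_stopper = EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)
# ...and then pass callbacks=[checkpointer, early_stopper] to model.fit below.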

if os.path.exists("./checkpoints/checkpoint"):
    model.load_weights("./checkpoints/my_checkpoint")
else:
    model.fit(
        X_train,
        Y_train,
        epochs=25,
        batch_size=32,
        callbacks=[checkpointer]
    )

    model.save_weights("./checkpoints/my_checkpoint")

prediction = model.predict(X_test)
predicted_prices = minmax_scaler.inverse_transform(prediction).flatten()

counter = 0

#for i in range(prediction.shape[0]-1):
#    if (prediction[i+1,] - prediction[i,] > 0 and predicted_prices[i+1,] - predicted_prices[i,] > 0) or (prediction[i+1,] - prediction[i,] < 0 and predicted_prices[i+1,] - predicted_prices[i,] < 0):
#        counter = counter + 1

#print("acc: ", counter/prediction.shape[0])


test_prices = price[time_window - 1 + train_size:]

pred_ret = []
actual_ret = []
for j in range(len(test_prices) - 1):
    # the predicted price is tomorrow's price; compare it against tomorrow's actual return
    pred_ret.append((predicted_prices[j] - test_prices[j])/test_prices[j])
    actual_ret.append((test_prices[j+1] - test_prices[j])/test_prices[j])

pred_ret_np = np.array(pred_ret)
actual_ret_np = np.array(actual_ret)

# Fraction of days where predicted and actual returns agree in sign,
# over the whole test set and over shorter prefixes of it
sign_comp = np.sum(np.sign(pred_ret_np) == np.sign(actual_ret_np))/len(pred_ret_np)
sign_comp_red_nottoomuch = np.sum(np.sign(pred_ret_np[:200]) == np.sign(actual_ret_np[:200]))/len(pred_ret_np[:200])
sign_comp_red = np.sum(np.sign(pred_ret_np[:100]) == np.sign(actual_ret_np[:100]))/len(pred_ret_np[:100])
sign_comp_red_alot = np.sum(np.sign(pred_ret_np[:50]) == np.sign(actual_ret_np[:50]))/len(pred_ret_np[:50])

print(sign_comp)
print(sign_comp_red_nottoomuch)
print(sign_comp_red)
print(sign_comp_red_alot)

rmse = calculate_rmse(test_prices[1:], predicted_prices)
mape = calculate_mape(test_prices[1:], predicted_prices)

print("RMSE: ", rmse)
print("MAPE: ", mape)

rmse = calculate_rmse(test_prices[1:301], predicted_prices[:300])
mape = calculate_mape(test_prices[1:301], predicted_prices[:300])

print("RMSE over 300 days: ", rmse)
print("MAPE over 300 days: ", mape)

#plt.plot(pred_ret, color=seshadri[0])
#plt.plot(daily_returns[1:], color=seshadri[1])

fig = plt.figure(1, figsize=(12,10))
plt.plot(test_prices, color=seshadri[0], label="Registered Closing Price")
plt.plot(predicted_prices, color=seshadri[1], label="Prediction")

#plot params
plt.xlim([0,1200])
plt.ylim([100,400])
plt.minorticks_on()
plt.tick_params(labelsize=14)
plt.tick_params(labelbottom=True, labeltop=False, labelright=False, labelleft=True)
#xticks = np.arange(0, 1e4,10)
#yticks = np.arange(0,16.1,4)

plt.tick_params(direction='in', which='minor', length=5, bottom=True, top=True, left=True, right=True)
plt.tick_params(direction='in', which='major', length=10, bottom=True, top=True, left=True, right=True)
#plt.xticks(xticks)
#plt.yticks(yticks)


#plt.text(1,325, f'y={Decimal(coefs[3]):.4f}x$^3$+{Decimal(coefs[2]):.2f}x$^2$+{Decimal(coefs[1]):.2f}x+{Decimal(coefs[0]):.1f}',fontsize =13)


plt.xlabel(r'Days (from last training)', fontsize=14)
plt.ylabel(r'Price (USD)', fontsize=14)  # label the y axis

plt.legend(fontsize=14, loc="upper right", bbox_to_anchor=(0.99, 0.99))  # add the legend (will default to 'best' location)

plt.savefig("plots/First_Attempt_LSTM_2.png", dpi=300)

plt.show()
#with open("plots/data/MLP_20_10_5_2.csv", "a") as f:
#    f.write(f"{time_window};{train_score};{score};\n")

@ -0,0 +1,238 @@
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import yfinance as yf
from datetime import datetime
import os, sys

from sklearn import preprocessing

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Input
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# bodacious colors
colors = sns.color_palette("rocket", 8)
# Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry

np.set_printoptions(threshold=100)

def enlarge_lag(to_enlarge, time_window=1):
    # to_enlarge is the data already present, should be a numpy array
    enlarged = []
    for i in range(to_enlarge.shape[0] - time_window + 1):
        new_element = []
        for j in range(time_window):
            new_element.extend(to_enlarge[i + time_window - 1 - j, :])
        enlarged.append(new_element)

    return np.array(enlarged)

#### Calculate the metrics RMSE and MAPE ####
def calculate_rmse(y_true, y_pred):
    """
    Calculate the Root Mean Squared Error (RMSE)
    """
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    return rmse


def calculate_mape(y_true, y_pred):
    """
    Calculate the Mean Absolute Percentage Error (MAPE) %
    """
    y_pred, y_true = np.array(y_pred), np.array(y_true)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mape


train_quota = 0.8

if len(sys.argv) > 1:
    time_window = int(sys.argv[1])
else:
    time_window = 1

#time_window = 10

stock_data = pd.read_pickle("data/MSFT_data.pkl")

price = stock_data["Close"].to_numpy()
volume = stock_data["Volume"].to_numpy()
daily_returns = ((stock_data["Close"] - stock_data["Open"]) / stock_data["Open"]).to_numpy()

minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
sec_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

#features = np.vstack((price, volume)).T

# Necessary for MAs
#norm_features = np.hstack((minmax_scaler.fit_transform(price.reshape(-1, 1)), sec_scaler.fit_transform(volume.reshape(-1, 1))))
norm_features = minmax_scaler.fit_transform(price.reshape(-1, 1))

# Binary labels: 1 if the close-to-close return is non-negative, else 0
rets = np.diff(price)
bin_rets = np.zeros(len(rets))
for i, r in enumerate(rets):
    if r >= 0:
        bin_rets[i] = 1
    else:
        bin_rets[i] = 0

bin_rets_np = np.array(bin_rets)
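
# Equivalent vectorized construction (sketch; same result as the loop above):
# bin_rets = (rets >= 0).astype(float)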


#norm_rets = sec_scaler.fit_transform(rets.reshape(-1, 1))

# debug output
print("occai")

print(rets)
print(bin_rets)

print("ocai")

# merge data into 2d numpy array
#Y = np.zeros(norm_features.shape[0] - 1)
#for i in range(Y.size):
#    Y[i] = norm_features[i+1, 0]

Y = bin_rets

time_window = 20  # NOTE: hard-coded, overrides the command-line value parsed above

if time_window > 1:
    norm_features = enlarge_lag(norm_features, time_window)
    Y = Y[time_window-1:]


train_size = int(norm_features.shape[0] * 0.8)
X_train = norm_features[:train_size, ]
Y_train = Y[:train_size]

X_test = norm_features[train_size:-1, ]
Y_test = Y[train_size:]

def LSTM_model():
    model = Sequential()

    model.add(LSTM(units=20, return_sequences=True, input_shape=(X_train.shape[1], 1)))
    model.add(Dropout(0.2))

    #model.add(LSTM(units=50, return_sequences=True))
    #model.add(Dropout(0.2))

    model.add(LSTM(units=20))
    model.add(Dropout(0.2))

    model.add(Dense(units=5))
    model.add(Dropout(0.3))

    model.add(Dense(units=1, activation="sigmoid"))

    return model

model = LSTM_model()
model.summary()
model.compile(
    optimizer="adam",
    loss="mean_squared_error"
)
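
# Editorial note: the targets here are binary, so loss="binary_crossentropy"
# with metrics=["accuracy"] would be the conventional choice (the next script
# does exactly that); MSE on 0/1 labels still trains but is harder to interpret.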

#if os.path.exists("./checkpoints/checkpoint"):
#    model.load_weights("./checkpoints/my_checkpoint")
#else:
model.fit(
    X_train,
    Y_train,
    shuffle=True,
    epochs=20,
    batch_size=20
)

#model.save_weights("./checkpoints/my_checkpoint")

prediction = model.predict(X_test)
print(prediction)
print(model.evaluate(X_test, Y_test))
#predicted_prices = minmax_scaler.inverse_transform(prediction).flatten()
#predicted_rets = sec_scaler.inverse_transform(prediction).flatten()
#print(predicted_rets)
#counter = 0
#for i in range(prediction.shape[0]-1):
#    if (prediction[i+1,] - prediction[i,] > 0 and predicted_prices[i+1,] - predicted_prices[i,] > 0) or (prediction[i+1,] - prediction[i,] < 0 and predicted_prices[i+1,] - predicted_prices[i,] < 0):
#        counter = counter + 1

#print("acc: ", counter/prediction.shape[0])


#test_prices = price[time_window - 1 + train_size:]
#pred_ret = []
#actual_ret = []
#for j in range(len(test_prices) - 1):
#    # the predicted price is tomorrow's price; compare it against tomorrow's actual return
#    pred_ret.append((predicted_prices[j] - test_prices[j])/test_prices[j])
#    actual_ret.append((test_prices[j+1] - test_prices[j])/test_prices[j])
#
#pred_ret_np = np.array(pred_ret)
#actual_ret_np = np.array(actual_ret)
#
#sign_comp = np.sum(np.sign(pred_ret_np) == np.sign(actual_ret_np))/len(pred_ret_np)
#sign_comp_red_nottoomuch = np.sum(np.sign(pred_ret_np[:200]) == np.sign(actual_ret_np[:200]))/len(pred_ret_np[:200])
#sign_comp_red = np.sum(np.sign(pred_ret_np[:100]) == np.sign(actual_ret_np[:100]))/len(pred_ret_np[:100])
#sign_comp_red_alot = np.sum(np.sign(pred_ret_np[:50]) == np.sign(actual_ret_np[:50]))/len(pred_ret_np[:50])
#print(sign_comp)
#print(sign_comp_red_nottoomuch)
#print(sign_comp_red)
#print(sign_comp_red_alot)

#rmse = calculate_rmse(test_prices[1:], predicted_prices)
#mape = calculate_mape(test_prices[1:], predicted_prices)
#
#print("RMSE: ", rmse)
#print("MAPE: ", mape)
#
#rmse = calculate_rmse(test_prices[1:301], predicted_prices[:300])
#mape = calculate_mape(test_prices[1:301], predicted_prices[:300])
#
#print("RMSE over 300 days: ", rmse)
#print("MAPE over 300 days: ", mape)

#plt.plot(pred_ret, color=seshadri[0])
#plt.plot(daily_returns[1:], color=seshadri[1])

fig = plt.figure(1, figsize=(12,10))
# Y_test holds binary up/down labels here, not prices
plt.plot(Y_test, color=seshadri[0], label="Actual direction (1 = up)")
plt.plot(prediction, color=seshadri[1], label="Predicted probability")

#plot params
#plt.xlim([0,450])
#plt.ylim([-0.5,16])
plt.minorticks_on()
plt.tick_params(labelsize=14)
plt.tick_params(labelbottom=True, labeltop=False, labelright=False, labelleft=True)
#xticks = np.arange(0, 1e4,10)
#yticks = np.arange(0,16.1,4)

plt.tick_params(direction='in', which='minor', length=5, bottom=True, top=True, left=True, right=True)
plt.tick_params(direction='in', which='major', length=10, bottom=True, top=True, left=True, right=True)
#plt.xticks(xticks)
#plt.yticks(yticks)


#plt.text(1,325, f'y={Decimal(coefs[3]):.4f}x$^3$+{Decimal(coefs[2]):.2f}x$^2$+{Decimal(coefs[1]):.2f}x+{Decimal(coefs[0]):.1f}',fontsize =13)


plt.xlabel(r'Days (from last training)', fontsize=14)
plt.ylabel(r'Return direction', fontsize=14)  # label the y axis

plt.legend(fontsize=14, loc="upper right", bbox_to_anchor=(0.99, 0.99))  # add the legend (will default to 'best' location)

plt.savefig("plots/LSTM_advanced_rets_1.png", dpi=300)

plt.show()
#with open("plots/data/MLP_20_10_5_2.csv", "a") as f:
#    f.write(f"{time_window};{train_score};{score};\n")

@ -0,0 +1,230 @@
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import yfinance as yf
from datetime import datetime
import os, sys

from sklearn import preprocessing

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Input
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# bodacious colors
colors = sns.color_palette("rocket", 8)
# Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry

np.set_printoptions(threshold=1000000)

def enlarge_lag(to_enlarge, time_window=1):
    # to_enlarge is the data already present, should be a numpy array
    enlarged = []
    for i in range(to_enlarge.shape[0] - time_window + 1):
        new_element = []
        for j in range(time_window):
            new_element.extend(to_enlarge[i + time_window - 1 - j, :])
        enlarged.append(new_element)

    return np.array(enlarged)


train_quota = 0.8

if len(sys.argv) > 1:
    time_window = int(sys.argv[1])
else:
    time_window = 1


stock_data = pd.read_pickle("data/MSFT_data.pkl")

price = stock_data["Close"].to_numpy()
volume = stock_data["Volume"].to_numpy()

#minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
minmax_scaler = preprocessing.StandardScaler()  # NOTE: despite the name, this is now a StandardScaler
sec_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

#EMA_20 = stock_data["Close"].ewm(span=20, adjust=False).mean()
#EMA_50 = stock_data["Close"].ewm(span=50, adjust=False).mean()
#EMAs = np.vstack((EMA_20, EMA_50)).T
#norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 2)

#EMA_200 = stock_data["Close"].ewm(span=200, adjust=False).mean()
#EMAs = np.vstack((EMA_20, EMA_50, EMA_200)).T
#norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 3)

# Necessary for MAs
#norm_features = np.hstack((minmax_scaler.fit_transform(price.reshape(-1, 1)), sec_scaler.fit_transform(volume.reshape(-1, 1))))
norm_features = minmax_scaler.fit_transform(np.vstack((price, volume)).T)
#norm_features = np.hstack((norm_features, norm_EMAs))

# Binary labels: 1 if the close-to-close return is non-negative, else 0
rets = np.diff(price)
bin_rets = np.zeros(len(rets))
for i, r in enumerate(rets):
    if r >= 0:
        bin_rets[i] = 1
    else:
        bin_rets[i] = 0

bin_rets_np = np.array(bin_rets)


#norm_rets = sec_scaler.fit_transform(rets.reshape(-1, 1))

# debug output
print("occai")

print(rets)
print(bin_rets)

print("ocai")

# merge data into 2d numpy array
#Y = np.zeros(norm_features.shape[0] - 1)
#for i in range(Y.size):
#    Y[i] = norm_features[i+1, 0]

Y = bin_rets

time_window = 3  # NOTE: hard-coded, overrides the command-line value parsed above

if time_window > 1:
    norm_features = enlarge_lag(norm_features, time_window)
    Y = Y[time_window-1:]


train_size = int(norm_features.shape[0] * 0.8)
X_train = norm_features[:train_size, ]
Y_train = Y[:train_size].reshape(-1, 1)

X_test = norm_features[train_size:-1, ]
Y_test = Y[train_size:].reshape(-1, 1)
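
# Sanity check (editorial sketch, not in the original script): the share of
# "up" days is the accuracy a constant majority-class predictor would reach,
# a useful floor when reading the ~0.5 accuracies discussed in the notes.
# print("up-day share, train/test:", Y_train.mean(), Y_test.mean())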

def LSTM_model():
    model = Sequential()

    model.add(LSTM(units=20, input_shape=(X_train.shape[1], 1)))
    #model.add(Dense(units=20, activation="relu", input_shape=(X_train.shape[1],)))
    #model.add(Dropout(0.3))

    #model.add(LSTM(units=50, return_sequences=True))
    #model.add(Dropout(0.2))

    model.add(Dense(units=10, activation="relu"))

    model.add(Dense(units=5, activation="relu"))

    model.add(Dense(units=1, activation="sigmoid"))

    return model

model = LSTM_model()
model.summary()
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy']
)

#if os.path.exists("./checkpoints/checkpoint"):
#    model.load_weights("./checkpoints/my_checkpoint")
#else:
model.fit(
    X_train,
    Y_train,
    shuffle=True,
    epochs=50,
    batch_size=32
)

#model.save_weights("./checkpoints/my_checkpoint")

prediction = model.predict(X_test).flatten()
print("pred: ", prediction)
print(model.evaluate(X_test, Y_test))
#predicted_prices = minmax_scaler.inverse_transform(prediction).flatten()
#predicted_rets = sec_scaler.inverse_transform(prediction).flatten()
#print(predicted_rets)
#counter = 0
#for i in range(prediction.shape[0]-1):
#    if (prediction[i+1,] - prediction[i,] > 0 and predicted_prices[i+1,] - predicted_prices[i,] > 0) or (prediction[i+1,] - prediction[i,] < 0 and predicted_prices[i+1,] - predicted_prices[i,] < 0):
#        counter = counter + 1

#print("acc: ", counter/prediction.shape[0])


#test_prices = price[time_window - 1 + train_size:]
#pred_ret = []
#actual_ret = []
#for j in range(len(test_prices) - 1):
#    # the predicted price is tomorrow's price; compare it against tomorrow's actual return
#    pred_ret.append((predicted_prices[j] - test_prices[j])/test_prices[j])
#    actual_ret.append((test_prices[j+1] - test_prices[j])/test_prices[j])
#
#pred_ret_np = np.array(pred_ret)
#actual_ret_np = np.array(actual_ret)
#
#sign_comp = np.sum(np.sign(pred_ret_np) == np.sign(actual_ret_np))/len(pred_ret_np)
#sign_comp_red_nottoomuch = np.sum(np.sign(pred_ret_np[:200]) == np.sign(actual_ret_np[:200]))/len(pred_ret_np[:200])
#sign_comp_red = np.sum(np.sign(pred_ret_np[:100]) == np.sign(actual_ret_np[:100]))/len(pred_ret_np[:100])
#sign_comp_red_alot = np.sum(np.sign(pred_ret_np[:50]) == np.sign(actual_ret_np[:50]))/len(pred_ret_np[:50])
#print(sign_comp)
#print(sign_comp_red_nottoomuch)
#print(sign_comp_red)
#print(sign_comp_red_alot)

#rmse = calculate_rmse(test_prices[1:], predicted_prices)
#mape = calculate_mape(test_prices[1:], predicted_prices)
#
#print("RMSE: ", rmse)
#print("MAPE: ", mape)
#
#rmse = calculate_rmse(test_prices[1:301], predicted_prices[:300])
#mape = calculate_mape(test_prices[1:301], predicted_prices[:300])
#
#print("RMSE over 300 days: ", rmse)
#print("MAPE over 300 days: ", mape)

#plt.plot(pred_ret, color=seshadri[0])
#plt.plot(daily_returns[1:], color=seshadri[1])

fig = plt.figure(1, figsize=(12,10))
# Y_test holds binary up/down labels here, not prices
plt.plot(Y_test, color=seshadri[0], label="Actual direction (1 = up)")
plt.plot(prediction, color=seshadri[1], label="Predicted probability")

#plot params
#plt.xlim([0,450])
#plt.ylim([-0.5,16])
plt.minorticks_on()
plt.tick_params(labelsize=14)
plt.tick_params(labelbottom=True, labeltop=False, labelright=False, labelleft=True)
#xticks = np.arange(0, 1e4,10)
#yticks = np.arange(0,16.1,4)

plt.tick_params(direction='in', which='minor', length=5, bottom=True, top=True, left=True, right=True)
plt.tick_params(direction='in', which='major', length=10, bottom=True, top=True, left=True, right=True)
#plt.xticks(xticks)
#plt.yticks(yticks)


#plt.text(1,325, f'y={Decimal(coefs[3]):.4f}x$^3$+{Decimal(coefs[2]):.2f}x$^2$+{Decimal(coefs[1]):.2f}x+{Decimal(coefs[0]):.1f}',fontsize =13)


plt.xlabel(r'Days (from last training)', fontsize=14)
plt.ylabel(r'Return direction', fontsize=14)  # label the y axis

plt.legend(fontsize=14, loc="upper right", bbox_to_anchor=(0.99, 0.99))  # add the legend (will default to 'best' location)

plt.savefig("plots/LSTM_advanced_rets_1.png", dpi=300)

plt.show()
#with open("plots/data/MLP_20_10_5_2.csv", "a") as f:
#    f.write(f"{time_window};{train_score};{score};\n")

@ -0,0 +1,119 @@
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import yfinance as yf
from datetime import datetime
import os, sys

from sklearn import preprocessing

# bodacious colors
colors = sns.color_palette("rocket", 8)
# Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry

train_quota = 0.8

def enlarge_lag(to_enlarge, time_window=1):
    # to_enlarge is the data already present, should be a numpy array
    enlarged = []
    for i in range(to_enlarge.shape[0] - time_window + 1):
        new_element = []
        for j in range(time_window):
            new_element.extend(to_enlarge[i + time_window - 1 - j, :])
        enlarged.append(new_element)

    return np.array(enlarged)


if len(sys.argv) > 1:
    time_window = int(sys.argv[1])
else:
    time_window = 1

#time_window = 10

stock_data = pd.read_pickle("data/MSFT_data.pkl")

daily_returns = ((stock_data["Close"] - stock_data["Open"]) / stock_data["Open"]).to_numpy()
prices = stock_data[["Open", "High", "Low", "Close"]].to_numpy()
volume = stock_data["Volume"].to_numpy()

minmax_scaler = preprocessing.MinMaxScaler()
std_scaler = preprocessing.StandardScaler()

features = np.vstack((daily_returns, volume)).T

# Necessary for MAs
part_features = std_scaler.fit_transform(features)

# Add EMAs as features
EMA_20 = stock_data["Close"].ewm(span=20, adjust=False).mean()
EMA_50 = stock_data["Close"].ewm(span=50, adjust=False).mean()
EMAs = np.vstack((EMA_20, EMA_50)).T
norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 2)
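
# Background note (editorial): pandas' ewm(span=s) uses alpha = 2 / (s + 1),
# i.e. EMA_t = alpha * x_t + (1 - alpha) * EMA_{t-1}, so span=20 weights
# roughly the last 20 closes. Scaling both EMAs with a single min-max fit
# (as above) preserves their relative ordering, so EMA20/EMA50 crossovers
# survive the normalization.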

#EMA_200 = stock_data["Close"].ewm(span=200, adjust=False).mean()
#EMAs = np.vstack((EMA_20, EMA_50, EMA_200)).T
#norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 3)
norm_features = np.hstack((part_features, norm_EMAs))


# merge data into 2d numpy array
Y = np.zeros(features.shape[0] - 1)

# Binary labels: 1 if tomorrow's open-to-close return is non-negative, else 0
for i in range(Y.size):
    if daily_returns[i+1] >= 0:
        Y[i] = 1
    else:
        Y[i] = 0

# for when MAs with windows up to 200 are used
#Y = Y[49:]
#Y = Y[199:]

print(norm_features.shape, Y.shape)

if time_window > 1:
    norm_features = enlarge_lag(norm_features, time_window)
    Y = Y[time_window-1:]

train_size = int(norm_features.shape[0] * 0.8)
X_train = norm_features[:train_size, ]
Y_train = Y[:train_size]

X_test = norm_features[train_size:-1, ]
Y_test = Y[train_size:]


# Iterations vs Accuracy plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, acc_array)
#plt.xlabel("Iterations")
#plt.ylabel("Accuracy")
#
## Iterations vs Loss plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, losses)
#plt.xlabel("Iterations")
#plt.ylabel("Losses")
#
#plt.show()


# let's try sklearn
from sklearn.neural_network import MLPClassifier
#classifier = LogisticRegression(random_state=0, solver="saga").fit(X_train, Y_train)
clf = MLPClassifier(hidden_layer_sizes=(20,10,5,2), max_iter=30000, verbose=True).fit(X_train, Y_train)
train_score = clf.score(X_train, Y_train)
score = clf.score(X_test, Y_test)
print("sklearn score, all default: ", score, " train ", train_score)

with open("plots/data/MLP_20_10_5_2.csv", "a") as f:
    f.write(f"{time_window};{train_score};{score};\n")

@ -0,0 +1,7 @@
# Stock Price Prediction

This is a simple project: a stock price prediction tool that tests various configurations.

The final goal is to study how performance improves as the technology and the methods used change.

Final try

@ -0,0 +1,145 @@
# Notes on the development of the project

The reference ticker for the first part of the study is Microsoft, because it is:
* highly capitalized
* long-lived
* a tech stock, but one that does not undergo dynamics too "strange" compared to normal market behavior (unlike, e.g., Tesla)

First, we test the performance of a simple, untrained model that tries to predict the sign of the next day's return from the sign of the previous day's return (+ follows + and - follows -).

A small bar chart is also included to give an idea of the distribution of the returns.

winrate detected: 0.47638123852445335
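
A minimal sketch of this baseline (assuming `daily_returns` computed as in the scripts, open-to-close):

    import numpy as np
    pred = np.sign(daily_returns[:-1])    # today's sign, used as tomorrow's forecast
    actual = np.sign(daily_returns[1:])
    print("winrate:", np.mean(pred == actual))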

First tests with logistic regression, adding past days as features: a slight improvement, but too many days leads to overfitting.

Try running the same test while adding some indicators (e.g., moving averages) -> there is a slight improvement.

In the MLP data file names, the numbers are the hidden layer sizes, in order of depth.
First very simple test, architecture below:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 lstm (LSTM)                 (None, 20, 50)            10400

 dropout (Dropout)           (None, 20, 50)            0

 lstm_1 (LSTM)               (None, 20, 50)            20200

 dropout_1 (Dropout)         (None, 20, 50)            0

 lstm_2 (LSTM)               (None, 50)                20200

 dropout_2 (Dropout)         (None, 50)                0

 dense (Dense)               (None, 1)                 51

=================================================================
Total params: 50,851
Trainable params: 50,851
Non-trainable params: 0
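
(Sanity check on these counts: a Keras LSTM layer has 4*(n*(n+m)+n) parameters for n units and m input features, so the first layer gives 4*(50*(50+1)+50) = 10,400 and the deeper ones 4*(50*(50+50)+50) = 20,200, matching the summary above.)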

A simple 25 epochs and a 0.8 / 0.2 split;
the resulting plot is that one.

with data (win rate on the returns):
whole testing set (): 0.4991624790619765
first 200 days: 0.605
first 100 days: 0.58
first 50 days: 0.66

over the whole set:
RMSE: 76.4 (dollars?)
MAPE: 21.8 %

over 300 days:
RMSE over 300 days: 6.4 $
MAPE over 300 days: 2.9 %

In the presentation, first show the plot with MAPE and RMSE over the whole set, with some remarks, then zoom in with the specific MAPE and RMSE + the win rate over fewer days.


In the first advanced script the architecture is:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 lstm (LSTM)                 (None, 10, 10)            480

 dropout (Dropout)           (None, 10, 10)            0

 lstm_1 (LSTM)               (None, 10)                840

 dropout_1 (Dropout)         (None, 10)                0

 dense (Dense)               (None, 5)                 55

 dropout_2 (Dropout)         (None, 5)                 0

 dense_1 (Dense)             (None, 1)                 6

=================================================================
Total params: 1,381
Trainable params: 1,381
Non-trainable params: 0
_________________________________________________________________


LSTM advanced 2: training data reduced to 2000 days, architecture:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 lstm (LSTM)                 (None, 10, 20)            1760

 dropout (Dropout)           (None, 10, 20)            0

 lstm_1 (LSTM)               (None, 20)                3280

 dropout_1 (Dropout)         (None, 20)                0

 dense (Dense)               (None, 5)                 105

 dropout_2 (Dropout)         (None, 5)                 0

 dense_1 (Dense)             (None, 1)                 6

=================================================================
Total params: 5,151
Trainable params: 5,151
Non-trainable params: 0

results:
RMSE: 10.799429328578809
MAPE: 3.1894335488381116
RMSE over 300 days: 11.607057105021592
MAPE over 300 days: 3.591834377775106

training of 50 epochs,
but the win rate on the next day's return stays at ~0.5


Number 3 has a much simplified architecture and keeps a time window of only 5 days:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 lstm (LSTM)                 (None, 10)                480

 dropout (Dropout)           (None, 10)                0

 dense (Dense)               (None, 1)                 11

=================================================================
Total params: 491
Trainable params: 491
Non-trainable params: 0
_________________________________________________________________
RMSE: 12.955399161548117
MAPE: 3.7480157718302904
RMSE over 300 days: 11.019121338505466
MAPE over 300 days: 3.3382726092879706

not much is gained in win rate

semilog histogram

@ -0,0 +1,134 @@
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import yfinance as yf
from datetime import datetime
import os, sys

from sklearn import preprocessing

# bodacious colors
colors = sns.color_palette("rocket", 8)
# Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry

stock_data = pd.read_pickle("data/MSFT_data.pkl")

daily_returns = ((stock_data["Close"] - stock_data["Open"]) / stock_data["Open"]).to_numpy() * 100
prices = stock_data[["Open", "High", "Low", "Close"]].to_numpy()
volume = stock_data["Volume"].to_numpy()

minmax_scaler = preprocessing.MinMaxScaler()
std_scaler = preprocessing.StandardScaler()

features = np.vstack((daily_returns, volume)).T

# Scale volume data to obtain better results
#minmax_scaler = preprocessing.MinMaxScaler()
#norm_ret = std_scaler.fit_transform(daily_returns.reshape(-1,1)).flatten()
#norm_vol = minmax_scaler.fit_transform(volume.reshape(-1,1)).flatten()
#norm_features = np.vstack((norm_ret, norm_vol)).T

# Volumes and returns only
#norm_features = std_scaler.fit_transform(features)

# Adding prices
#norm_prices = minmax_scaler.fit_transform(prices.reshape(-1, 1)).reshape(-1, 4)
#norm_ret_and_vol = std_scaler.fit_transform(features)
#norm_features = np.hstack((norm_ret_and_vol, norm_prices))

# Necessary for MAs
part_features = std_scaler.fit_transform(features)

# Add SMAs
#SMA_20 = stock_data["Close"].rolling(20).mean().to_numpy()
#SMA_50 = stock_data["Close"].rolling(50).mean().to_numpy()
#SMA_200 = stock_data["Close"].rolling(200).mean().to_numpy()
#SMAs = np.vstack((SMA_20, SMA_50)).T
#norm_SMAs = minmax_scaler.fit_transform(SMAs[49:, ].reshape(-1, 1)).reshape(-1, 2)
#norm_features = np.hstack((part_features[49:, ], norm_SMAs))

#SMAs = np.vstack((SMA_20, SMA_50, SMA_200)).T
#norm_SMAs = minmax_scaler.fit_transform(SMAs[199:, ].reshape(-1, 1)).reshape(-1, 3)
#norm_features = np.hstack((part_features[199:, ], norm_SMAs))

# Add EMAs
EMA_20 = stock_data["Close"].ewm(span=20, adjust=False).mean()
EMA_50 = stock_data["Close"].ewm(span=50, adjust=False).mean()
EMAs = np.vstack((EMA_20, EMA_50)).T
norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 2)

#EMA_200 = stock_data["Close"].ewm(span=200, adjust=False).mean()
#EMAs = np.vstack((EMA_20, EMA_50, EMA_200)).T
#norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 3)
norm_features = np.hstack((part_features, norm_EMAs))

dfeat = {"Daily Returns": norm_features[:,0],
         "Volume": norm_features[:,1],
         "EMA20": norm_features[:,2],
         "EMA50": norm_features[:,3]
         }

corr = pd.DataFrame(dfeat).corr()
fig = plt.figure(1, (11, 10))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap="mako")
plt.tick_params(labelsize=14)

plt.savefig("plots/Correlation_EMAs.png", dpi=300)

# merge data into 2d numpy array
Y = np.zeros(features.shape[0] - 1)

# Binary labels: 1 if tomorrow's open-to-close return is non-negative, else 0
for i in range(Y.size):
    if daily_returns[i+1] >= 0:
        Y[i] = 1
    else:
        Y[i] = 0

# for when MAs with windows up to 200 are used
#Y = Y[49:]
#Y = Y[199:]

print(norm_features.shape, Y.shape)

fig, ax = plt.subplots(figsize=(15,10))

#plot params
#plt.xlim([-12,12])
#plt.ylim([-0.5,16])
ax.minorticks_on()
ax.tick_params(labelsize=14)
ax.tick_params(labelbottom=True, labeltop=False, labelright=False, labelleft=True)
#xticks = np.arange(0, 1e4,10)
#yticks = np.arange(0,16.1,4)

ax.tick_params(direction='in', which='minor', length=5, bottom=True, top=True, left=True, right=True)
ax.tick_params(direction='in', which='major', length=10, bottom=True, top=True, left=True, right=True)
#plt.xticks(xticks)
#plt.yticks(yticks)


#plt.text(1,325, f'y={Decimal(coefs[3]):.4f}x$^3$+{Decimal(coefs[2]):.2f}x$^2$+{Decimal(coefs[1]):.2f}x+{Decimal(coefs[0]):.1f}',fontsize =13)

ax.set_xlim([0, 500])
#ax.set_ylim([-0.5, 0.5])


pd.plotting.autocorrelation_plot(daily_returns, ax=ax, color=seshadri[0], label="Daily Returns")
pd.plotting.autocorrelation_plot(np.abs(daily_returns), ax=ax, color=seshadri[1], label="Absolute Daily Returns")
pd.plotting.autocorrelation_plot(volume, ax=ax, color=seshadri[2], label="Volume")

ax.grid(False)
ax.set_xlabel(r'Lag', fontsize=14)
ax.set_ylabel(r'Autocorrelation', fontsize=14)  # label the y axis

ax.legend(fontsize=14, loc="upper right", bbox_to_anchor=(0.99, 0.99))  # add the legend (will default to 'best' location)
plt.savefig("plots/Autocorrelation_returns_volume_abs.png", dpi=300)

@ -0,0 +1,228 @@
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import yfinance as yf
from datetime import datetime
import os, sys

from sklearn import preprocessing

# bodacious colors
colors = sns.color_palette("rocket", 8)
# Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry

train_quota = 0.8

def enlarge_lag(to_enlarge, time_window=1):
    # to_enlarge is the data already present, should be a numpy array
    enlarged = []
    for i in range(to_enlarge.shape[0] - time_window + 1):
        new_element = []
        for j in range(time_window):
            new_element.extend(to_enlarge[i + time_window - 1 - j, :])
        enlarged.append(new_element)

    return np.array(enlarged)


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def logreg_inference(x, w, b):
    z = (x @ w) + b
    p = sigmoid(z)
    return p


def cross_entropy(P, Y):
    return (-Y * np.log(P) - (1 - Y) * np.log(1 - P)).mean()
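
# Editorial note on the gradients used in logreg_train below: with
# p = sigmoid(z), the binary cross-entropy satisfies d(loss)/dz = p - y,
# which is why the loop can use grad_b = (P - Y).mean() and
# grad_w = X.T @ (P - Y) / m directly.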


def logreg_train(X, Y, lambda_, lr=1e-4, steps=100000):
    # The training samples are defined as such (each row of X is a sample):
    # X[0, :] -> Y[0]
    # X[1, :] -> Y[1]

    m, n = X.shape

    # Initial values for the parameters
    w = np.zeros(n)
    b = 0

    # Initial values for the "previous loss" and "convergence" variables, used to check convergence
    prec_loss = 0
    convergence = 0

    for step in range(steps):
        P = logreg_inference(X, w, b)
        loss = cross_entropy(P, Y)

        if step % 1000 == 0:
            print(step, loss)

        # Difference between the previous loss and the current loss
        diff = np.absolute(prec_loss - loss)
        prec_loss = loss
        if diff < 0.00001:
            # If convergence is reached, the algorithm is stopped
            convergence = step
            break

        # Derivative of the loss function with respect to bias
        grad_b = (P - Y).mean()

        # Gradient of the loss function with respect to weights
        grad_w = (X.T @ (P - Y)) / m

        w -= lr * grad_w
        b -= lr * grad_b

        # Every 100 iterations the values of accuracy and loss are saved for plotting
        # (into the global lists acc_array and losses defined below)
        if step % 100 == 0:
            Yhat = (P > 0.5)
            acc_array.append((Y == Yhat).mean() * 100)
            losses.append(loss)

    # Print the iterations needed for convergence before returning
    print("Convergence = ", convergence)

    return w, b


if len(sys.argv) > 1:
    time_window = int(sys.argv[1])
else:
    time_window = 1

#time_window = 10

stock_data = pd.read_pickle("data/MSFT_data.pkl")

daily_returns = ((stock_data["Close"] - stock_data["Open"]) / stock_data["Open"]).to_numpy()
prices = stock_data[["Open", "High", "Low", "Close"]].to_numpy()
volume = stock_data["Volume"].to_numpy()

minmax_scaler = preprocessing.MinMaxScaler()
std_scaler = preprocessing.StandardScaler()

features = np.vstack((daily_returns, volume)).T

# Scale volume data to obtain better results
#minmax_scaler = preprocessing.MinMaxScaler()
#norm_ret = std_scaler.fit_transform(daily_returns.reshape(-1,1)).flatten()
#norm_vol = minmax_scaler.fit_transform(volume.reshape(-1,1)).flatten()
#norm_features = np.vstack((norm_ret, norm_vol)).T

# Volumes and returns only
#norm_features = std_scaler.fit_transform(features)

# Adding prices
#norm_prices = minmax_scaler.fit_transform(prices.reshape(-1, 1)).reshape(-1, 4)
#norm_ret_and_vol = std_scaler.fit_transform(features)
#norm_features = np.hstack((norm_ret_and_vol, norm_prices))

# Necessary for MAs
part_features = std_scaler.fit_transform(features)

# Add SMAs
#SMA_20 = stock_data["Close"].rolling(20).mean().to_numpy()
#SMA_50 = stock_data["Close"].rolling(50).mean().to_numpy()
#SMA_200 = stock_data["Close"].rolling(200).mean().to_numpy()
#SMAs = np.vstack((SMA_20, SMA_50)).T
#norm_SMAs = minmax_scaler.fit_transform(SMAs[49:, ].reshape(-1, 1)).reshape(-1, 2)
#norm_features = np.hstack((part_features[49:, ], norm_SMAs))

#SMAs = np.vstack((SMA_20, SMA_50, SMA_200)).T
#norm_SMAs = minmax_scaler.fit_transform(SMAs[199:, ].reshape(-1, 1)).reshape(-1, 3)
#norm_features = np.hstack((part_features[199:, ], norm_SMAs))

# Add EMAs
EMA_20 = stock_data["Close"].ewm(span=20, adjust=False).mean()
EMA_50 = stock_data["Close"].ewm(span=50, adjust=False).mean()
EMAs = np.vstack((EMA_20, EMA_50)).T
norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 2)

#EMA_200 = stock_data["Close"].ewm(span=200, adjust=False).mean()
#EMAs = np.vstack((EMA_20, EMA_50, EMA_200)).T
#norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 3)
norm_features = np.hstack((part_features, norm_EMAs))


# merge data into 2d numpy array
Y = np.zeros(features.shape[0] - 1)

# Binary labels: 1 if tomorrow's open-to-close return is non-negative, else 0
for i in range(Y.size):
    if daily_returns[i+1] >= 0:
        Y[i] = 1
    else:
        Y[i] = 0

# for when MAs with windows up to 200 are used
#Y = Y[49:]
#Y = Y[199:]

print(norm_features.shape, Y.shape)

if time_window > 1:
    norm_features = enlarge_lag(norm_features, time_window)
    Y = Y[time_window-1:]

train_size = int(norm_features.shape[0] * 0.8)
X_train = norm_features[:train_size, ]
Y_train = Y[:train_size]

X_test = norm_features[train_size:-1, ]
Y_test = Y[train_size:]

#if time_window > 1:
#    X_train = enlarge_lag(X_train)
#    Y_train = Y_train[time_window-1:]
#
#    X_test = enlarge_lag(X_test)
#    Y_test = Y_test[time_window-1:]


# Lists to save accuracy and loss
acc_array = []
losses = []

w, b = logreg_train(X_train, Y_train, 0.0, 1e-3, 1000000)
print("Weights: ", w)
print("Bias: ", b)

# Iterations vs Accuracy plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, acc_array)
#plt.xlabel("Iterations")
#plt.ylabel("Accuracy")
#
## Iterations vs Loss plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, losses)
#plt.xlabel("Iterations")
#plt.ylabel("Losses")
#
#plt.show()
# The training accuracy of the model is the last value recorded in the array
print("Training Acc: ", acc_array[-1])

P_test = logreg_inference(X_test, w, b)
Yhat_test = (P_test > 0.5)
accuracy_test = (Y_test == Yhat_test).mean()
print("Test accuracy: ", 100*accuracy_test)


# let's try sklearn
#from sklearn.linear_model import LogisticRegression
#classifier = LogisticRegression(random_state=0, solver="saga").fit(X_train, Y_train)
#score = classifier.score(X_test, Y_test)
#print("sklearn score, all default: ", score)

with open("plots/data/logistic_regression_EMA_20_50.csv", "a") as f:
    f.write(f"{time_window};{acc_array[-1]};{accuracy_test};\n")

@ -0,0 +1,203 @@
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
|
||||||
|
import yfinance as yf
|
||||||
|
from datetime import datetime
|
||||||
|
import os, sys
|
||||||
|
|
||||||
|
from sklearn import preprocessing
|
||||||
|
|
||||||
|
#bodacious colors
|
||||||
|
colors=sns.color_palette("rocket", 8)
|
||||||
|
#Ram's colors, if desired
|
||||||
|
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
|
||||||
|
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry
|
||||||
|
|
||||||
|
train_quota = 0.8
|
||||||
|
|
||||||
|
def enlarge_lag(to_enlarge, time_window=1):
|
||||||
|
# to_enlarge is the data already present, should be a numpy array
|
||||||
|
enlarged = []
|
||||||
|
for i in range(to_enlarge.shape[0] - time_window + 1):
|
||||||
|
new_element = []
|
||||||
|
for j in range(time_window):
|
||||||
|
new_element.extend(to_enlarge[i + time_window - 1 - j, :])
|
||||||
|
enlarged.append(new_element)
|
||||||
|
|
||||||
|
return np.array(enlarged)
|
||||||
|
|
||||||
|
def sigmoid(z):
|
||||||
|
return 1 / (1 + np.exp(-z))
|
||||||
|
|
||||||
|
|
||||||
|
def logreg_inference(x, w, b):
|
||||||
|
z = (x @ w) + b
|
||||||
|
p = sigmoid(z)
|
||||||
|
return p
|
||||||
|
|
||||||
|
|
||||||
|
def cross_entropy(P, Y):
|
||||||
|
return (-Y * np.log(P) - (1 - Y) * np.log(1 - P)).mean()
|
||||||
|
|
||||||
|
|
||||||
|
def logreg_train(X, Y, lambda_, lr = 1e-4, steps=100000):
|
||||||
|
# The training samples are defined as such (each row of X is a sample):
|
||||||
|
# X[0, :] -> Y[0]
|
||||||
|
# X[1, :] -> Y[1]
|
||||||
|
|
||||||
|
m, n = X.shape
|
||||||
|
|
||||||
|
# Initial values for the parameters
|
||||||
|
w = np.zeros(n)
|
||||||
|
b = 0
|
||||||
|
|
||||||
|
# Initial values for the "precedent loss" and "convergence" variables, used to check convergence
|
||||||
|
prec_loss = 0
|
||||||
|
convergence = 0
|
||||||
|
|
||||||
|
for step in range(steps):
|
||||||
|
P = logreg_inference(X, w, b)
|
||||||
|
loss = cross_entropy(P, Y)
|
||||||
|
|
||||||
|
|
||||||
|
if step % 1000 == 0:
|
||||||
|
print(step, loss)
|
||||||
|
|
||||||
|
# Difference between "precedent loss" and "current loss"
|
||||||
|
diff = np.absolute(prec_loss - loss)
|
||||||
|
prec_loss = loss
|
||||||
|
if diff < 0.00001:
|
||||||
|
# If convergence is reached, the algorithm is stopped
|
||||||
|
convergence = step
|
||||||
|
break
|
||||||
|
|
||||||
|
# Derivative of the loss function with respect to bias
|
||||||
|
grad_b = (P - Y).mean()
|
||||||
|
|
||||||
|
# Gradient of the loss function with respect to weights
|
||||||
|
grad_w = (X.T @ (P - Y)) / m
|
||||||
|
|
||||||
|
w -= lr * grad_w
|
||||||
|
b -= lr * grad_b
|
||||||
|
|
||||||
|
# Every 100 iteration the values of accuracy and loss are saved for plotting
|
||||||
|
if step%100 == 0:
|
||||||
|
Yhat = (P > 0.5)
|
||||||
|
acc_array.append((Y == Yhat).mean() * 100)
|
||||||
|
losses.append(loss)
|
||||||
|
|
||||||
|
# Print the iterations needed for convergence before returning
|
||||||
|
print("Convergence = ", convergence)
|
||||||
|
|
||||||
|
return w, b

if len(sys.argv) > 1:
    time_window = int(sys.argv[1])
else:
    time_window = 1

#time_window = 10

stock_data = pd.read_pickle("data/MSFT_data.pkl")

daily_returns = ((stock_data["Close"] - stock_data["Open"]) / stock_data["Open"]).to_numpy()
prices = stock_data[["Open", "High", "Low", "Close"]].to_numpy()
volume = stock_data["Volume"].to_numpy()

minmax_scaler = preprocessing.MinMaxScaler()
std_scaler = preprocessing.StandardScaler()

features = np.vstack((daily_returns, volume)).T

# Standardize returns and volume (needed before mixing them with the MAs)
part_features = std_scaler.fit_transform(features)

# Build the label vector: 1 if the next day's return is non-negative, 0 otherwise
Y = np.zeros(features.shape[0] - 1)

for i in range(Y.size):
    if daily_returns[i+1] >= 0:
        Y[i] = 1
    else:
        Y[i] = 0

import copy

if time_window > 1:
    large_rets = enlarge_lag(part_features[:, 0].reshape(-1, 1), time_window)
    Y = Y[time_window-1:]
else:
    large_rets = copy.deepcopy(part_features[:, 0].reshape(-1, 1))

part_features = np.hstack((large_rets, part_features[time_window-1:, 1].reshape(-1, 1)))


# Add the EMAs
EMA_20 = stock_data["Close"].ewm(span=20, adjust=False).mean()
EMA_50 = stock_data["Close"].ewm(span=50, adjust=False).mean()
EMAs = np.vstack((EMA_20, EMA_50)).T
norm_EMAs = minmax_scaler.fit_transform(EMAs.reshape(-1, 1)).reshape(-1, 2)
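# Note: the two EMA columns are flattened and min-max scaled jointly (one
# shared min/max), and the scaler is fit on the full series, so a small
# amount of future information leaks into the training features.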

norm_features = np.hstack((part_features, norm_EMAs[time_window-1:,]))


print(norm_features.shape, Y.shape)


train_size = int(norm_features.shape[0] * 0.8)
X_train = norm_features[:train_size, ]
Y_train = Y[:train_size]

# The last feature row has no next-day label, so it is dropped from the test set
X_test = norm_features[train_size:-1, ]
Y_test = Y[train_size:]

#if time_window > 1:
#    X_train = enlarge_lag(X_train)
#    Y_train = Y_train[time_window-1:]
#
#    X_test = enlarge_lag(X_test)
#    Y_test = Y_test[time_window-1:]


# Lists to save accuracy and loss
acc_array = []
losses = []

w, b = logreg_train(X_train, Y_train, 0.0, 1e-3, 1000000)
print("Weights: ", w)
print("Bias: ", b)

# Iterations vs Accuracy plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, acc_array)
#plt.xlabel("Iterations")
#plt.ylabel("Accuracy")
#
## Iterations vs Loss plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, losses)
#plt.xlabel("Iterations")
#plt.ylabel("Losses")
#
#plt.show()

# The training accuracy of the model is the last value recorded in the array
print("Training Acc: ", acc_array[-1])

P_test = logreg_inference(X_test, w, b)
Yhat_test = (P_test > 0.5)
accuracy_test = (Y_test == Yhat_test).mean()
print("Test accuracy: ", 100*accuracy_test)


# Let's try sklearn
#from sklearn.linear_model import LogisticRegression
#classifier = LogisticRegression(random_state=0, solver="saga").fit(X_train, Y_train)
#score = classifier.score(X_test, Y_test)
#print("sklearn score, all default: ", score)

with open("plots/data/logistic_regression_EMA_20_50_only_daily_enlarged.csv", "a") as f:
    f.write(f"{time_window};{acc_array[-1]};{accuracy_test};\n")
@ -0,0 +1,170 @@

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import yfinance as yf
from datetime import datetime
import os, sys

from sklearn import preprocessing

#bodacious colors
colors=sns.color_palette("rocket", 8)
#Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry

train_quota = 0.8

def enlarge_lag(to_enlarge, time_window=1):
    # to_enlarge is the data already present, should be a numpy array
    enlarged = []
    for i in range(to_enlarge.shape[0] - time_window + 1):
        new_element = []
        for j in range(time_window):
            new_element.extend(to_enlarge[i + time_window - 1 - j, :])
        enlarged.append(new_element)

    return np.array(enlarged)


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def logreg_inference(x, w, b):
    z = (x @ w) + b
    p = sigmoid(z)
    return p


def cross_entropy(P, Y):
    return (-Y * np.log(P) - (1 - Y) * np.log(1 - P)).mean()


def logreg_train(X, Y, lambda_, lr=1e-4, steps=100000):
    # The training samples are defined as such (each row of X is a sample):
    # X[0, :] -> Y[0]
    # X[1, :] -> Y[1]

    m, n = X.shape

    # Initial values for the parameters
    w = np.zeros(n)
    b = 0

    # Initial values for the "previous loss" and "convergence" variables, used to check convergence
    prec_loss = 0
    convergence = 0

    for step in range(steps):
        P = logreg_inference(X, w, b)
        loss = cross_entropy(P, Y)

        if step % 1000 == 0:
            print(step, loss)

        # Difference between the previous loss and the current loss
        diff = np.absolute(prec_loss - loss)
        prec_loss = loss
        if diff < 0.00001:
            # If convergence is reached, the algorithm stops
            convergence = step
            break

        # Derivative of the loss function with respect to the bias
        grad_b = (P - Y).mean()

        # Gradient of the loss function with respect to the weights
        grad_w = (X.T @ (P - Y)) / m

        w -= lr * grad_w
        b -= lr * grad_b

        # Every 100 iterations the values of accuracy and loss are saved for plotting
        if step % 100 == 0:
            Yhat = (P > 0.5)
            acc_array.append((Y == Yhat).mean() * 100)
            losses.append(loss)

    # Print the iterations needed for convergence before returning
    print("Convergence = ", convergence)

    return w, b

if len(sys.argv) > 1:
    time_window = int(sys.argv[1])
else:
    time_window = 1

#time_window = 10

stock_data = pd.read_pickle("data/MSFT_data.pkl")

daily_returns = ((stock_data["Close"] - stock_data["Open"]) / stock_data["Open"]).to_numpy().reshape(-1,1)


# Build the label vector: 1 if the next day's return is non-negative, 0 otherwise
Y = np.zeros(daily_returns.shape[0] - 1)

print(daily_returns.shape, Y.shape)

for i in range(Y.size):
    if daily_returns[i+1] >= 0:
        Y[i] = 1
    else:
        Y[i] = 0

import copy

norm_features = copy.deepcopy(daily_returns)
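# Unlike the EMA/volume variants, this script feeds the raw (unscaled) daily
# returns as the only feature, so any accuracy change across time_window
# isolates the effect of the lag window itself.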

if time_window > 1:
    norm_features = enlarge_lag(norm_features, time_window)
    Y = Y[time_window-1:]


train_size = int(norm_features.shape[0] * 0.8)
X_train = norm_features[:train_size, ]
Y_train = Y[:train_size]

X_test = norm_features[train_size:-1, ]
Y_test = Y[train_size:]


# Lists to save accuracy and loss
acc_array = []
losses = []

w, b = logreg_train(X_train, Y_train, 0.0, 1e-3, 1000000)
print("Weights: ", w)
print("Bias: ", b)

# Iterations vs Accuracy plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, acc_array)
#plt.xlabel("Iterations")
#plt.ylabel("Accuracy")
#
## Iterations vs Loss plot
#plt.figure()
#plt.plot(np.arange(0, len(acc_array)) * 100, losses)
#plt.xlabel("Iterations")
#plt.ylabel("Losses")
#
#plt.show()

# The training accuracy of the model is the last value recorded in the array
print("Training Acc: ", acc_array[-1])

P_test = logreg_inference(X_test, w, b)
Yhat_test = (P_test > 0.5)
accuracy_test = (Y_test == Yhat_test).mean()
print("Test accuracy: ", 100*accuracy_test)


# Let's try sklearn
#from sklearn.linear_model import LogisticRegression
#classifier = LogisticRegression(random_state=0, solver="saga").fit(X_train, Y_train)
#score = classifier.score(X_test, Y_test)
#print("sklearn score, all default: ", score)

with open("plots/data/logistic_regression_only_rets.csv", "a") as f:
    f.write(f"{time_window};{acc_array[-1]};{accuracy_test};\n")
@ -0,0 +1,8 @@
#!/bin/bash

for i in $(seq 1 50);
do
    echo "Running with time window $i"
    python3 logistic_regression_enlarge_only_rets.py $i
done
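# Note: each run appends one "time_window;train;test" row to the CSV that the
# Python script opens in append mode, so remove the old CSV before a fresh sweep.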
@ -0,0 +1,8 @@
#!/bin/bash

for i in $(seq 1 50);
do
    echo "Running with time window $i"
    python3 MultiLayer_Perceptron.py $i
done
[binary files: 15 plot images added, 120–510 KiB]
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;0.5157521385353641;0.5325542570951586;
2;0.5070951585976627;0.5317195325542571;
3;0.6231218697829716;0.48955722639933164;
4;0.6103109997912753;0.4653299916457811;
5;0.6471816283924844;0.4903926482873851;
6;0.6604719148047609;0.4928989139515455;
7;0.6641604010025063;0.5137844611528822;
8;0.7013366750208856;0.520066889632107;
9;0.6897848339252142;0.48327759197324416;
10;0.6136648558295027;0.46321070234113715;
11;0.6587251828631139;0.4866220735785953;
12;0.7215719063545151;0.5259197324414716;
13;0.6684782608695652;0.4811715481171548;
14;0.748693288730922;0.5188284518828452;
15;0.740694270179841;0.5154811715481171;
16;0.6801924283622673;0.47447698744769873;
17;0.7684100418410041;0.5238493723849372;
18;0.7566945606694561;0.5008375209380235;
19;0.7928436911487758;0.5309882747068677;
20;0.8139388865634156;0.49413735343383586;
21;0.6206824366757379;0.5301507537688442;
22;0.7889447236180904;0.507537688442211;
23;0.6379815745393634;0.4660519698239732;
24;0.6751832460732984;0.5071248952221291;
25;0.6258902387934646;0.46437552388935455;
26;0.7301487534045673;0.49706621961441744;
27;0.6787510477787091;0.46689019279128247;
28;0.8658843252305113;0.47651006711409394;
29;0.7048836721861245;0.535234899328859;
30;0.8633123689727463;0.5075503355704698;
31;0.8228140071293772;0.5067114093959731;
32;0.8181627516778524;0.5058724832214765;
33;0.6447147651006712;0.5104953820319059;
34;0.8833647996643591;0.4802686817800168;
35;0.8063365505665128;0.5071368597816961;
36;0.8818467995802728;0.5146935348446684;
37;0.6358102434928632;0.5331654072208228;
38;0.8717464315701091;0.5117647058823529;
39;0.9048918748687802;0.47394957983193275;
40;0.6698866022679546;0.5319327731092437;
41;0.6784289014912833;0.4689075630252101;
42;0.7008403361344537;0.5210084033613446;
43;0.8613445378151261;0.49705634987384356;
44;0.7066610632485817;0.5021026072329688;
45;0.6353509878100042;0.5365853658536586;
46;0.6771074206432626;0.5256518082422204;
47;0.5971404541631623;0.47434819175777965;
48;0.8092935239697224;0.4983164983164983;
49;0.9335436382754995;0.4983164983164983;
50;0.9091291543962978;0.5244107744107744;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;0.547673690799082;0.5217028380634391;
2;0.6400250417362271;0.508347245409015;
3;0.715567612687813;0.5472013366750209;
4;0.7424337299102484;0.4803675856307435;
5;0.7749478079331942;0.48370927318295737;
6;0.8275214032157027;0.4954051796157059;
7;0.8105680868838764;0.4928989139515455;
8;0.8335421888053467;0.504180602006689;
9;0.8272404428660957;0.49414715719063546;
10;0.8763058921855411;0.4891304347826087;
11;0.864158829676071;0.5150501672240803;
12;0.8858695652173914;0.5066889632107023;
13;0.9015468227424749;0.5213389121338912;
14;0.8873092201547146;0.5129707112970712;
15;0.90987034713509;0.5288702928870292;
16;0.9115247856097051;0.5171548117154812;
17;0.9156903765690376;0.47280334728033474;
18;0.9217573221757323;0.5117252931323283;
19;0.9378531073446328;0.48576214405360135;
20;0.9158643784010047;0.474036850921273;
21;0.9522712999790663;0.4949748743718593;
22;0.9711055276381909;0.5293132328308208;
23;0.9384422110552764;0.5037720033528919;
24;0.9759162303664921;0.5155071248952221;
25;0.9733975701717638;0.5146689019279128;
26;0.9664781060129898;0.48365465213746855;
27;0.972338642078793;0.5037720033528919;
28;0.9867979882648784;0.4714765100671141;
29;0.9811360301823517;0.4790268456375839;
30;0.9656184486373166;0.5201342281879194;
31;0.9746278045711889;0.4983221476510067;
32;0.9351929530201343;0.5192953020134228;
33;0.9729446308724832;0.4903442485306465;
34;0.9815397524648626;0.48446683459277917;
35;0.9326479227864037;0.5088161209068011;
36;0.985099685204617;0.4945424013434089;
37;0.9685138539042821;0.5155331654072208;
38;0.9901343408900084;0.4613445378151261;
39;0.9494016376233466;0.5042016806722689;
40;0.9647207055858883;0.5100840336134453;
41;0.9798361688720857;0.5;
42;0.992436974789916;0.5058823529411764;
43;0.9897058823529412;0.49032800672834315;
44;0.9815087203193948;0.5172413793103449;
45;0.9836065573770492;0.5029436501261564;
46;0.9882278747109523;0.496215306980656;
47;0.9960050462573591;0.5273338940285954;
48;1.0;0.5025252525252525;
49;0.9707676130389065;0.49326599326599324;
50;0.9728649558266723;0.4941077441077441;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;0.5366158981848529;0.5217028380634391;
2;0.5899415692821369;0.5350584307178631;
3;0.6750834724540902;0.5087719298245614;
4;0.7261532039240242;0.48120300751879697;
5;0.7701461377870563;0.49958228905597324;
6;0.7584046773856755;0.5087719298245614;
7;0.7644110275689223;0.5430242272347535;
8;0.8398078529657477;0.49414715719063546;
9;0.8748694380614164;0.5016722408026756;
10;0.9352277475971584;0.47240802675585286;
11;0.9264367816091954;0.4807692307692308;
12;0.9423076923076923;0.520066889632107;
13;0.9412625418060201;0.4794979079497908;
14;0.9272423165377378;0.507949790794979;
15;0.958594730238394;0.5196652719665272;
16;0.9811754862999372;0.497071129707113;
17;0.992887029288703;0.5263598326359833;
18;0.9947698744769874;0.509212730318258;
19;0.9922577945176815;0.49581239530988275;
20;0.9945583926329008;0.525963149078727;
21;0.9920452166631777;0.509212730318258;
22;0.9972780569514238;0.5117252931323283;
23;0.9886934673366834;0.49958088851634536;
24;0.9912041884816754;0.5046102263202011;
25;0.9981147884373691;0.509639564124057;
26;0.9974858579509742;0.5004191114836547;
27;0.9945515507124896;0.5297569153394803;
28;1.0;0.5033557046979866;
29;0.9997904003353595;0.5260067114093959;
30;0.99958071278826;0.5243288590604027;
31;1.0;0.47818791946308725;
32;1.0;0.5151006711409396;
33;0.9991610738255033;0.5264483627204031;
34;1.0;0.4979009235936188;
35;0.9987410826689047;0.5029387069689337;
36;1.0;0.5188916876574308;
37;1.0;0.4869857262804366;
38;1.0;0.49411764705882355;
39;1.0;0.5117647058823529;
40;1.0;0.5243697478991597;
41;1.0;0.5184873949579832;
42;0.9997899159663866;0.49747899159663866;
43;1.0;0.5063078216989066;
44;1.0;0.48107653490328006;
45;1.0;0.4920100925147183;
46;1.0;0.5088309503784693;
47;0.9997897392767031;0.4953742640874685;
48;1.0;0.515993265993266;
49;1.0;0.4772727272727273;
50;1.0;0.5067340067340067;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;52.075944085124135;0.5367278797996661;
2;51.81552587646077;0.5392320534223706;
3;51.982470784641066;0.5380116959064327;
4;52.118555625130455;0.545530492898914;
5;52.4008350730689;0.5396825396825397;
6;52.41177698893297;0.5338345864661654;
7;52.903091060985794;0.5405179615705932;
8;52.75689223057645;0.5392976588628763;
9;53.45728013369543;0.540133779264214;
10;53.5311324697033;0.5409698996655519;
11;53.45872518286311;0.5384615384615384;
12;54.117892976588635;0.5351170568561873;
13;54.0133779264214;0.5364016736401673;
14;54.129207610286436;0.5338912133891214;
15;53.95232120451694;0.5364016736401673;
16;54.1518510771805;0.5338912133891214;
17;54.16317991631799;0.5372384937238494;
18;53.80753138075314;0.533500837520938;
19;53.65139150449885;0.5343383584589615;
20;54.227710339054;0.5284757118927973;
21;54.30186309399204;0.5309882747068677;
22;54.25041876046901;0.5293132328308208;
23;54.14572864321608;0.5305951383067896;
24;54.575916230366495;0.5322715842414082;
25;54.18935902806871;0.5305951383067896;
26;54.03310287031218;0.5314333612740989;
27;54.37971500419112;0.5255658005029338;
28;54.715004191114836;0.5285234899328859;
29;54.51687277300357;0.5310402684563759;
30;54.75890985324947;0.5302013422818792;
31;54.41392325435102;0.5293624161073825;
32;55.39010067114094;0.5209731543624161;
33;55.45302013422819;0.5256087321578505;
34;55.67442836165303;0.5222502099076406;
35;55.686109945446916;0.5155331654072208;
36;56.222455403987404;0.5205709487825357;
37;56.549118387909324;0.5163727959697733;
38;56.27623845507976;0.519327731092437;
39;56.0781020365316;0.5134453781512605;
40;56.425871482570344;0.5134453781512605;
41;56.2906952320941;0.5117647058823529;
42;56.596638655462186;0.5134453781512605;
43;56.84873949579832;0.5138772077375946;
44;56.81865938222316;0.5096719932716569;
45;56.36822194199244;0.5105130361648444;
46;56.54824469203279;0.511354079058032;
47;56.53910849453322;0.511354079058032;
48;56.64423885618166;0.5126262626262627;
49;56.88748685594112;0.5126262626262627;
50;56.710138830458554;0.5058922558922558;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;51.99248904652618;0.5375626043405676;
2;51.77378964941569;0.5375626043405676;
3;51.982470784641066;0.5396825396825397;
4;51.99332080985181;0.5405179615705932;
5;52.35908141962422;0.5355054302422724;
6;52.59970766339528;0.5304928989139516;
7;52.86131996658312;0.5388471177944862;
8;52.673350041771094;0.5426421404682275;
9;53.39461040317527;0.5418060200668896;
10;53.51023819473464;0.540133779264214;
11;53.396029258098224;0.544314381270903;
12;54.03428093645485;0.5409698996655519;
13;53.992474916387955;0.5414225941422595;
14;54.10830022998119;0.5430962343096234;
15;54.01505646173149;0.5422594142259414;
16;54.17276720351391;0.5422594142259414;
17;54.16317991631799;0.5405857740585774;
18;53.661087866108794;0.5435510887772195;
19;53.7769407825905;0.5452261306532663;
20;54.39514441188782;0.541038525963149;
21;54.17626125183169;0.5385259631490787;
22;54.10385259631491;0.5402010050251256;
23;54.29229480737019;0.5406538139145013;
24;54.51308900523561;0.5389773679798826;
25;54.18935902806871;0.539815590947192;
26;53.86549340037712;0.5406538139145013;
27;54.48449287510477;0.539815590947192;
28;54.882648784576695;0.5461409395973155;
29;54.47495284007545;0.5394295302013423;
30;54.67505241090147;0.5461409395973155;
31;54.246173201929125;0.5411073825503355;
32;55.45302013422819;0.5444630872483222;
33;55.369127516778526;0.5373635600335852;
34;55.695405915670236;0.5197313182199832;
35;55.74905581200168;0.5214105793450882;
36;56.306400839454355;0.5289672544080605;
37;56.42317380352645;0.5214105793450882;
38;56.507136859781696;0.5176470588235295;
39;56.330044089859335;0.5176470588235295;
40;56.32087358252835;0.5260504201680672;
41;56.20667926906112;0.5235294117647059;
42;56.57563025210084;0.5184873949579832;
43;56.72268907563025;0.5180824222035324;
44;56.881697835679766;0.5147182506307821;
45;56.32618747372846;0.5088309503784693;
46;56.443136430523445;0.5088309503784693;
47;56.53910849453322;0.5046257359125316;
48;56.72834314550042;0.5075757575757576;
49;56.92954784437434;0.5084175084175084;
50;56.62599915860328;0.5092592592592593;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;52.05508032547465;0.5392320534223706;
2;51.794657762938236;0.5383973288814691;
3;52.00333889816361;0.5396825396825397;
4;52.01419327906491;0.5421888053467001;
5;52.4008350730689;0.5371762740183793;
6;52.516182919189816;0.531328320802005;
7;52.88220551378446;0.5363408521303258;
8;52.694235588972425;0.5426421404682275;
9;53.45728013369543;0.5409698996655519;
10;53.5311324697033;0.540133779264214;
11;53.37513061650993;0.5418060200668896;
12;54.05518394648829;0.540133779264214;
13;53.90886287625418;0.5414225941422595;
14;54.10830022998119;0.5414225941422595;
15;53.97323295692179;0.5414225941422595;
16;54.089102698180305;0.5405857740585774;
17;54.121338912133886;0.5422594142259414;
18;53.74476987447699;0.5393634840871022;
19;53.75601590290856;0.542713567839196;
20;54.41607367099205;0.5393634840871022;
21;54.13439397111157;0.5385259631490787;
22;54.166666666666664;0.5393634840871022;
23;54.29229480737019;0.5381391450125733;
24;54.51308900523561;0.5381391450125733;
25;54.16841223292836;0.5406538139145013;
26;53.90739576786089;0.5414920368818106;
27;54.505448449287506;0.5389773679798826;
28;54.84073763621124;0.5444630872483222;
29;54.495912806539515;0.5369127516778524;
30;54.67505241090147;0.5444630872483222;
31;54.2881107150346;0.5394295302013423;
32;55.369127516778526;0.5419463087248322;
33;55.39010067114094;0.5340050377833753;
34;55.695405915670236;0.5197313182199832;
35;55.74905581200168;0.5214105793450882;
36;56.28541448058761;0.5264483627204031;
37;56.507136859781696;0.5222502099076406;
38;56.486146095717885;0.519327731092437;
39;56.30904891874869;0.5168067226890757;
40;56.34187316253675;0.5260504201680672;
41;56.18567527830288;0.5260504201680672;
42;56.53361344537815;0.5201680672268908;
43;56.785714285714285;0.5197645079899075;
44;56.839672200042024;0.5130361648444071;
45;56.32618747372846;0.5105130361648444;
46;56.485179735127176;0.511354079058032;
47;56.518082422203534;0.5063078216989066;
48;56.72834314550042;0.5084175084175084;
49;56.8664563617245;0.5134680134680135;
50;56.6470340765671;0.5058922558922558;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;52.05508032547465;0.5392320534223706;
2;52.10767946577629;0.5392320534223706;
3;52.27462437395659;0.5396825396825397;
4;52.43164266332707;0.5388471177944862;
5;52.546972860125265;0.5355054302422724;
6;52.28648987262476;0.5329991645781119;
7;52.02589807852965;0.5338345864661654;
8;52.54803675856308;0.5317725752508361;
9;51.932316691038224;0.5359531772575251;
10;52.08942749686586;0.5309364548494984;
11;52.18390804597701;0.5317725752508361;
12;52.65468227424749;0.5267558528428093;
13;52.612876254180605;0.5263598326359833;
14;52.64478360861384;0.5263598326359833;
15;52.676704307821;0.5263598326359833;
16;52.66680610750889;0.5263598326359833;
17;52.86610878661088;0.5238493723849372;
18;52.78242677824267;0.52428810720268;
19;52.395898723582334;0.5251256281407035;
20;52.51151109250733;0.525963149078727;
21;52.54343730374712;0.525963149078727;
22;52.68006700167505;0.5251256281407035;
23;52.701005025125625;0.5247275775356245;
24;52.33507853403141;0.5297569153394803;
25;52.785923753665685;0.5230511316010059;
26;52.44081290592919;0.5238893545683152;
27;52.724224643755235;0.5238893545683152;
28;52.47275775356245;0.5310402684563759;
29;52.79815552295116;0.5243288590604027;
30;52.64150943396226;0.5251677852348994;
31;52.48479765149927;0.5268456375838926;
32;52.57969798657718;0.5335570469798657;
33;53.73322147651006;0.5096557514693535;
34;53.63960562198448;0.5130142737195634;
35;53.86067981535879;0.5138539042821159;
36;53.5781741867786;0.5188916876574308;
37;53.778337531486144;0.5071368597816961;
38;53.56842989084802;0.5159663865546219;
39;53.68465252991812;0.5100840336134453;
40;53.77992440151197;0.5084033613445378;
41;53.68620037807184;0.5109243697478991;
42;53.739495798319325;0.5092436974789916;
43;53.508403361344534;0.5046257359125316;
44;54.12901870140786;0.5054667788057191;
45;53.90920554854981;0.5096719932716569;
46;54.172797981921384;0.5021026072329688;
47;54.14213624894869;0.5054667788057191;
48;53.973927670311184;0.51010101010101;
49;53.52260778128286;0.5143097643097643;
50;53.82835506941524;0.5109427609427609;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;52.05006473888649;0.538860103626943;
2;52.363479387006265;0.538860103626943;
3;52.09412780656304;0.5345423143350605;
4;52.26683937823834;0.5367329299913569;
5;52.47246814942776;0.5358686257562663;
6;52.6133909287257;0.5341400172860847;
7;52.99200691294016;0.5375972342264477;
8;53.34917891097667;0.5393258426966292;
9;53.284356093344854;0.5397923875432526;
10;53.16619840069159;0.5397923875432526;
11;53.17769130998703;0.5389273356401384;
12;53.729729729729726;0.5406574394463668;
13;53.61159169550172;0.5389273356401384;
14;53.67647058823529;0.5385281385281385;
15;53.77460523469608;0.5428571428571428;
16;53.82951103418434;0.5454545454545454;
17;53.408353170309454;0.5463203463203463;
18;53.961038961038966;0.5445887445887446;
19;53.74458874458874;0.5450606585788561;
20;53.799523706429966;0.5441941074523396;
21;54.04937202252057;0.5450606585788561;
22;54.12605588044185;0.5415944540727903;
23;54.24610051993067;0.5424610051993067;
24;54.24610051993067;0.546400693842151;
25;54.58288190682556;0.5437987857762359;
26;54.65973125270914;0.5403295750216826;
27;54.19466724474312;0.5420641803989592;
28;54.48829141370338;0.5394622723330442;
29;54.48829141370338;0.5399305555555556;
30;54.304923010193015;0.546875;
31;54.36008676789588;0.5425347222222222;
32;55.131264916467785;0.5477430555555556;
33;55.90277777777778;0.5364583333333334;
34;55.533854166666664;0.5334491746307559;
35;55.285435207293254;0.5351867940920938;
36;55.68823273990448;0.5317115551694179;
37;55.591748099891426;0.5325803649000869;
38;55.60382276281495;0.5325803649000869;
39;55.60382276281495;0.5356521739130434;
40;56.00695198783402;0.5339130434782609;
41;56.171229900043464;0.5278260869565218;
42;56.55292327754836;0.5252173913043479;
43;56.608695652173914;0.5278260869565218;
44;56.34782608695652;0.5274151436031331;
45;56.53402913676886;0.5282854656222803;
46;56.32883862548934;0.5317667536988686;
47;56.66739177724603;0.5274151436031331;
48;56.87554395126197;0.5300261096605744;
49;56.83202785030461;0.5174216027874564;
50;56.735582154515775;0.5156794425087108;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;52.04038704249053;0.5412457912457912;
2;51.90406059330949;0.5412457912457912;
3;52.02020202020202;0.5370370370370371;
4;52.20959595959596;0.5459140690817186;
5;52.68364554830563;0.5408593091828138;
6;52.71578947368422;0.5341196293176074;
7;52.76900400084228;0.5391743892165122;
8;52.906486941870256;0.540016849199663;
9;53.517270429654594;0.5370994940978078;
10;53.52854434379608;0.5370994940978078;
11;53.64517488411293;0.5379426644182125;
12;54.03582718651212;0.5396290050590219;
13;54.15261382799326;0.5396290050590219;
14;54.15261382799326;0.5409282700421941;
15;54.079696394686906;0.5434599156118144;
16;54.344158582876425;0.5417721518987342;
17;54.16578780847922;0.5451476793248945;
18;53.92405063291139;0.5417721518987342;
19;53.9873417721519;0.5396959459459459;
20;54.10424140113948;0.5396959459459459;
21;54.15787252005065;0.5388513513513513;
22;54.084863837872064;0.5388513513513513;
23;53.69510135135135;0.5422297297297297;
24;53.80067567567568;0.5435333896872359;
25;54.14994720168954;0.5409974640743872;
26;53.95014786649768;0.5452240067624683;
27;54.53200929642933;0.5486052409129332;
28;54.62806424344886;0.5469146238377007;
29;54.52240067624683;0.5431472081218274;
30;54.449376453181145;0.5439932318104906;
31;54.60887949260042;0.5346869712351946;
32;55.17022626348065;0.5431472081218274;
33;55.52030456852792;0.5397631133671743;
34;55.647208121827404;0.5309060118543607;
35;55.74360059234187;0.529212531752752;
36;56.03046974185357;0.5275190516511431;
37;56.00000000000001;0.5275190516511431;
38;55.82133784928027;0.5207451312447079;
39;55.927180355630824;0.5245762711864407;
40;56.15075164090621;0.5245762711864407;
41;55.781448538754766;0.5288135593220339;
42;56.15335733954671;0.5271186440677966;
43;56.92796610169491;0.5203389830508475;
44;56.525423728813564;0.5173876166242578;
45;56.55859292222929;0.5165394402035624;
46;56.358626536668076;0.5165394402035624;
47;56.32817468730125;0.5148430873621713;
48;56.59457167090755;0.5173876166242578;
49;56.46734520780322;0.5118845500848896;
50;56.182396606574756;0.5144312393887945;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;50.69893594825787;0.5317195325542571;
2;50.70951585976628;0.5317195325542571;
3;50.688647746243745;0.5321637426900585;
4;50.699227718639115;0.5321637426900585;
5;50.68893528183715;0.5321637426900585;
6;50.69951973272082;0.5321637426900585;
7;50.71010860484545;0.5321637426900585;
8;50.71010860484545;0.532608695652174;
9;50.69981199080844;0.532608695652174;
10;50.68951107396573;0.532608695652174;
11;50.67920585161965;0.532608695652174;
12;50.68979933110368;0.532608695652174;
13;50.68979933110368;0.5330543933054394;
14;50.700397240225804;0.5330543933054394;
15;50.71099958176495;0.5330543933054394;
16;50.700690232169;0.5330543933054394;
17;50.7112970711297;0.5330543933054394;
18;50.7112970711297;0.533500837520938;
19;50.700983469345054;0.533500837520938;
20;50.69066555043952;0.533500837520938;
21;50.680343311701904;0.533500837520938;
22;50.69095477386934;0.533500837520938;
23;50.69095477386934;0.5331098072087175;
24;50.680628272251305;0.5331098072087175;
25;50.69124423963134;0.5331098072087175;
26;50.68091347161114;0.5331098072087175;
27;50.69153394803018;0.5331098072087175;
28;50.69153394803018;0.5327181208053692;
29;50.7021588765458;0.5327181208053692;
30;50.712788259958074;0.5327181208053692;
31;50.7234221010694;0.5327181208053692;
32;50.73406040268457;0.5327181208053692;
33;50.713087248322154;0.5331654072208228;
34;50.72372561359345;0.5331654072208228;
35;50.7343684431389;0.5331654072208228;
36;50.724029380902415;0.5331654072208228;
37;50.71368597816961;0.5331654072208228;
38;50.73467674223342;0.5327731092436975;
39;50.72433340331723;0.5327731092436975;
40;50.734985300293985;0.5327731092436975;
41;50.72463768115942;0.5327731092436975;
42;50.71428571428571;0.5327731092436975;
43;50.71428571428571;0.5323801513877208;
44;50.724942214751;0.5323801513877208;
45;50.735603194619586;0.5323801513877208;
46;50.72524700441454;0.5323801513877208;
47;50.71488645920942;0.5323801513877208;
48;50.69386038687973;0.5328282828282829;
49;50.683491062039955;0.5328282828282829;
50;50.69415229280606;0.5328282828282829;
@ -0,0 +1,51 @@
time_window;training_accuracy;testing_accuracy;
1;51.99248904652618;0.5375626043405676;
2;51.85726210350584;0.5375626043405676;
3;52.00333889816361;0.5388471177944862;
4;52.01419327906491;0.5388471177944862;
5;52.35908141962422;0.5355054302422724;
6;52.62058884944665;0.5304928989139516;
7;52.98663324979115;0.5371762740183793;
8;52.7360066833751;0.5426421404682275;
9;53.39461040317527;0.5409698996655519;
10;53.489343919765986;0.5409698996655519;
11;53.333333333333336;0.5426421404682275;
12;53.992474916387955;0.5392976588628763;
13;54.03428093645485;0.5414225941422595;
14;54.15011499059168;0.5405857740585774;
15;54.077791718946045;0.5414225941422595;
16;54.19368332984731;0.5414225941422595;
17;54.14225941422595;0.5397489539748954;
18;53.59832635983264;0.5443886097152428;
19;53.672316384180796;0.5443886097152428;
20;54.353285893679356;0.5402010050251256;
21;54.239062172911865;0.5385259631490787;
22;54.082914572864325;0.5343383584589615;
23;54.22948073701842;0.537300922045264;
24;54.47120418848167;0.5356244761106455;
25;54.18935902806871;0.5364626990779547;
26;53.84454221663524;0.5356244761106455;
27;54.40067057837384;0.539815590947192;
28;54.90360435875943;0.5444630872483222;
29;54.43303290714735;0.537751677852349;
30;54.65408805031446;0.5427852348993288;
31;54.26714195848186;0.5394295302013423;
32;55.3481543624161;0.5436241610738255;
33;55.39010067114094;0.5340050377833753;
34;55.779316131739044;0.5197313182199832;
35;55.77003776751993;0.5214105793450882;
36;56.28541448058761;0.5264483627204031;
37;56.36020151133502;0.5239294710327456;
38;56.44416456759026;0.5235294117647059;
39;56.37203443208062;0.5184873949579832;
40;56.34187316253675;0.5235294117647059;
41;56.24868725057761;0.5235294117647059;
42;56.596638655462186;0.5126050420168067;
43;56.76470588235294;0.5147182506307821;
44;56.90271065349863;0.5138772077375946;
45;56.284153005464475;0.5138772077375946;
46;56.42211477822157;0.511354079058032;
47;56.53910849453322;0.5079899074852817;
48;56.749369217830115;0.5067340067340067;
49;56.992639327024186;0.5092592592592593;
50;56.56289440471182;0.51010101010101;
[binary files: 4 plot images added, 279–632 KiB]
@ -0,0 +1,61 @@

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os

#bodacious colors
colors=sns.color_palette("rocket", 8)
#Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry

data = pd.read_csv("plots/data/MLP_20_10_5_2.csv", sep=";")
#data = pd.read_csv("plots/data/logistic_regression.csv", sep=";")
#data_SMA = pd.read_csv("plots/data/logistic_regression_SMA.csv", sep=";")
#data_SMA_20_50 = pd.read_csv("plots/data/logistic_regression_SMA_20_50.csv", sep=";")
#data_EMA = pd.read_csv("plots/data/logistic_regression_EMA.csv", sep=";")
#data_EMA_20_50 = pd.read_csv("plots/data/logistic_regression_EMA_20_50.csv", sep=";")

print(data)

fig = plt.figure(1, figsize=(15,10))
plt.plot(data["time_window"], data["training_accuracy"]*100, color=seshadri[0], label="Training Accuracy", linewidth=2)
plt.plot(data["time_window"], data["testing_accuracy"]*100, color=seshadri[1], label="Testing Accuracy", linewidth=2)
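# Caveat: in several of the sweep CSVs above, training_accuracy is already
# stored as a percentage while testing_accuracy is a fraction, so scaling
# both by 100 here would mix units; check the column ranges before plotting.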

#plt.plot(data["time_window"], data["testing_accuracy"]*100, color=seshadri[0], label="Returns and Volume", linewidth=2)
#plt.plot(data_SMA_20_50["time_window"], data_SMA_20_50["testing_accuracy"]*100, color=seshadri[1], label="With SMA 20 and 50 candles", linewidth=2)
#plt.plot(data_SMA["time_window"], data_SMA["testing_accuracy"]*100, color=seshadri[2], label="With SMA 20, 50 and 200 candles", linewidth=2)
#plt.plot(data_EMA_20_50["time_window"], data_EMA_20_50["testing_accuracy"]*100, color=seshadri[3], label="With EMA 20 and 50 candles", linewidth=2)
#plt.plot(data_EMA["time_window"], data_EMA["testing_accuracy"]*100, color=seshadri[4], label="With EMA 20, 50 and 200 candles", linewidth=2)


#plot params
plt.xlim([0, 50])
#plt.ylim([50, 60])
plt.minorticks_on()
plt.tick_params(labelsize=14)
plt.tick_params(labelbottom=True, labeltop=False, labelright=False, labelleft=True)
#xticks = np.arange(0, 1e4,10)
#yticks = np.arange(0,16.1,4)

plt.tick_params(direction='in',which='minor', length=5, bottom=True, top=True, left=True, right=True)
plt.tick_params(direction='in',which='major', length=10, bottom=True, top=True, left=True, right=True)
#plt.xticks(xticks)
#plt.yticks(yticks)
#plt.grid(True)

#plt.text(1,325, f'y={Decimal(coefs[3]):.4f}x$^3$+{Decimal(coefs[2]):.2f}x$^2$+{Decimal(coefs[1]):.2f}x+{Decimal(coefs[0]):.1f}',fontsize =13)


plt.xlabel(r'Lag (Days)', fontsize=14)
plt.ylabel(r'Accuracy (%)',fontsize=14) # label the y axis

plt.legend(fontsize=14, loc="upper right", bbox_to_anchor=(0.99, 0.99)) # add the legend (will default to 'best' location)

plt.savefig("plots/MLP_20_10_5_2.png", dpi=300)

plt.show()
@ -0,0 +1,91 @@

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import yfinance as yf
from datetime import datetime

import os

#bodacious colors
colors=sns.color_palette("rocket", 8)
#Ram's colors, if desired
seshadri = ['#c3121e', '#0348a1', '#ffb01c', '#027608', '#0193b0', '#9c5300', '#949c01', '#7104b5']
# 0sangre, 1neptune, 2pumpkin, 3clover, 4denim, 5cocoa, 6cumin, 7berry

#stock_data = pd.read_csv("data/daily_MSFT.csv").iloc[::-1].reset_index(drop=True)

if os.path.isfile("data/MSFT_data.pkl"):
    stock_data = pd.read_pickle("data/MSFT_data.pkl")
elif os.path.isfile("data/MSFT_data.csv"):
    stock_data = pd.read_csv("data/MSFT_data.csv")
else:
    start_date = datetime(2000, 1, 1)
    end_date = datetime(2023, 10, 26)
    stock_data = yf.download('MSFT', start=start_date, end=end_date)
    stock_data.to_pickle("data/MSFT_data.pkl")
    stock_data.to_csv("data/MSFT_data.csv")
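# Caveat: the CSV branch above reads the file without index_col/parse_dates,
# so stock_data gets a plain RangeIndex there, unlike the pickled DataFrame.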


daily_returns = stock_data["Close"] - stock_data["Open"]
win_lose = np.zeros(daily_returns.size - 1)

# win_lose[index] is 1 when the sign of day index's return persists on the next day
# (daily_returns.iloc[index + 1] would be the explicit positional form)
for index, return_ in enumerate(daily_returns[:-1]):
    if (return_ > 0 and daily_returns[index + 1] > 0) or (return_ < 0 and daily_returns[index + 1] < 0):
        win_lose[index] = 1
    else:
        win_lose[index] = 0


win_rate = np.count_nonzero(win_lose == 1) / win_lose.size
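# Equivalent vectorized form (illustrative sketch, same zero handling as the
# loop above):
#   r = daily_returns.to_numpy()
#   wins = ((r[:-1] > 0) & (r[1:] > 0)) | ((r[:-1] < 0) & (r[1:] < 0))
#   win_rate = wins.mean()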

print(win_rate)

percent_returns = daily_returns / stock_data["Open"] * 100

fig = plt.figure(1, figsize=(15,10))
plt.hist(percent_returns, bins = 120, range=(-12,12), facecolor=seshadri[0], alpha=0.8, edgecolor="white", label="Percentage daily returns occurrences")

#plt.plot(stock_data.index, stock_data["Close"], linestyle="-", color=seshadri[0])
#plt.plot(stock_data.index, stock_data["Adj Close"], linestyle="-", color=seshadri[1])
#plt.plot(stock_data.index, stock_data["close"] - 20, linestyle="-", color=seshadri[2])
#plt.plot(stock_data.index, stock_data["close"] - 30, linestyle="-", color=seshadri[3])
#plt.plot(stock_data.index, stock_data["close"] - 40, linestyle="-", color=seshadri[4])
#plt.plot(stock_data.index, stock_data["close"] - 50, linestyle="-", color=seshadri[5])
#plt.plot(stock_data.index, stock_data["close"] - 60, linestyle="-", color=seshadri[6])
#plt.plot(stock_data.index, stock_data["close"] - 70, linestyle="-", color=seshadri[7])
#plt.show()


#plot params
plt.xlim([-12,12])
#plt.ylim([-0.5,16])
plt.minorticks_on()
plt.tick_params(labelsize=14)
plt.tick_params(labelbottom=True, labeltop=False, labelright=False, labelleft=True)
#xticks = np.arange(0, 1e4,10)
#yticks = np.arange(0,16.1,4)

plt.tick_params(direction='in',which='minor', length=5, bottom=True, top=True, left=True, right=True)
plt.tick_params(direction='in',which='major', length=10, bottom=True, top=True, left=True, right=True)
#plt.xticks(xticks)
#plt.yticks(yticks)

#plt.text(1,325, f'y={Decimal(coefs[3]):.4f}x$^3$+{Decimal(coefs[2]):.2f}x$^2$+{Decimal(coefs[1]):.2f}x+{Decimal(coefs[0]):.1f}',fontsize =13)


plt.xlabel(r'Percentage daily return', fontsize=14)
plt.ylabel(r'Occurrences',fontsize=14) # label the y axis

plt.yscale('log')

plt.legend(fontsize=14, loc="upper right", bbox_to_anchor=(0.99, 0.99)) # add the legend (will default to 'best' location)

plt.savefig("plots/MSFT_daily_occs_semilogx.png", dpi=300)

plt.show()