
#CO2 data
#https://gml.noaa.gov/ccgg/trends/data.html

#Temperature data
# These are anomalies, i.e. deviations
# from the long-term mean
#https://data.giss.nasa.gov/gistemp/graphs/graph_data/Global_Mean_Estimates_based_on_Land_and_Ocean_Data/graph.txt


import matplotlib.pyplot as plt
from scipy import stats

# Fraction of the data rows used for training (0.8 -> 80%);
# the remaining 20% are held out for testing.
SPLIT = 0.8

def read_csv(text_file, end):
    """Read a comma-separated text file into a list of rows.

    Args:
        text_file: Path of the file to read.
        end: Characters removed from both ends of every line before it
            is split; passed unchanged to ``str.strip`` (so it is a set
            of characters, not a suffix).

    Returns:
        A list with one entry per non-empty line. Each entry is the list
        of string fields obtained by splitting the stripped line on ",".
        Lines that are empty after stripping (for example a trailing
        blank line) are skipped instead of producing a ``[""]`` row.
    """
    data = []
    # The context manager closes the file even if reading raises.
    with open(text_file, 'r') as file:
        # Iterate the file lazily instead of loading every line up front.
        for record in file:
            record = record.strip(end)
            if not record:
                continue
            data.append(record.split(","))
    return data

def linear_model(x, slope, intercept):
    """Evaluate the line ``slope * x + intercept`` at every point of *x*.

    Args:
        x: Iterable of numeric inputs.
        slope: Gradient of the line.
        intercept: Value of the line at zero.

    Returns:
        A new list holding one fitted value per element of *x*, in order.
    """
    return [slope * value + intercept for value in x]

def scatter_plot(x_train, y_train, x_test, y_test, line):
    """Draw the training points with the fitted line and show the figure.

    Args:
        x_train: x values of the training points (CO2, per the axis label).
        y_train: y values of the training points (temperature anomaly,
            per the axis label).
        x_test: Accepted for the caller's convenience; currently not drawn.
        y_test: Accepted for the caller's convenience; currently not drawn.
        line: Fitted values, plotted against ``x_train``.
    """
    # Data first, then decoration; the test points are intentionally
    # left off this figure.
    plt.scatter(x_train, y_train)
    plt.plot(x_train, line)

    plt.xlabel("CO2 PPMV")
    plt.ylabel("Temperature Anomaly (deg C)")
    plt.show()

def split_train_test_data(csv_file):
    """Load two numeric columns from *csv_file* and split them by position.

    The first column of every row becomes an x value and the second a
    y value (both converted with ``float``). The leading ``SPLIT``
    fraction of the rows (index truncated with ``int``) forms the
    training set and the rest the test set; rows are not shuffled.

    Args:
        csv_file: Path handed to ``read_csv``.

    Returns:
        The tuple ``(x_train, y_train, y_test, x_test)`` -- note that the
        test pair is returned y first.
    """
    rows = read_csv(csv_file, ",\n")

    # Convert each row's x and then y, row by row.
    pairs = [(float(row[0]), float(row[1])) for row in rows]
    x = [pair[0] for pair in pairs]
    y = [pair[1] for pair in pairs]

    # Index of the first test row.
    cut = int(SPLIT * len(rows))

    return x[:cut], y[:cut], y[cut:], x[cut:]


def main():
    """Fit temperature anomaly against CO2 and report the test-set fit.

    Reads "co2-v-temp.csv", fits a least-squares line on the training
    split, predicts the test split, prints the actual and predicted
    values together with the standard error and correlation coefficient
    of a regression of the predictions on the actual values, and shows
    two figures: the training data with the fitted line, and predicted
    versus actual values for the test split.
    """
    # Read in data and split into training and testing
    x_train, y_train, y_test, x_test = split_train_test_data("co2-v-temp.csv")

    # Training phase: only slope and intercept of the fit are needed.
    slope, intercept, *_ = stats.linregress(x_train, y_train)
    line = linear_model(x_train, slope, intercept)

    # Test error: regress the predictions on the actual values. Separate
    # names keep the trained slope/intercept from being overwritten.
    prediction = linear_model(x_test, slope, intercept)
    _, _, r, _, std_err = stats.linregress(y_test, prediction)
    print("Actual:",y_test)
    print("prediction:",prediction)
    print("std_err:",std_err)
    print("r:",r)

    scatter_plot(x_train, y_train, x_test, y_test, line)

    # Actual values go on x and predictions on y so the data matches the
    # axis labels below (the arguments used to be the other way round).
    plt.scatter(y_test, prediction)
    # 1:1 reference line; the 0.6-1.1 range is hard-coded.
    plt.plot([0.6,1.1],[0.6,1.1])
    plt.xlabel("Actual Temperature Anomaly")
    plt.ylabel("Predicted Temperature Anomaly")
    plt.show()
 
# Run the analysis only when executed as a script, so that importing
# this module does not trigger file reads or open plot windows.
if __name__ == "__main__":
    main()
