Climate Change Data Analysis

Aditya Shakya , IIT Gandhinagar, s.aditya@iitgn.ac.in

Shantanu Sahu , IIT Gandhinagar, shantanu.s@iitgn.ac.in

Varun Barala , IIT Gandhinagar, barala.v@iitgn.ac.in

Objective

To examine the linear regression model before and after applying SVD to the data file, and to display results for matrices of various sizes. We show the interdependencies of various features and the consequence of eliminating them, demonstrate a broad understanding of PCA in order to choose the optimal dimension for the SVD application, and attempt to model the dataset with alternative models and regression approaches.

Dataset

The data was available in CSV format, which you may get here. It covers several columns, such as floor area, monthly minimum, average, and maximum temperatures, and elevation, with a total of around 75,000 rows. Some columns are numeric, whereas others, such as "facility type" and "building class," are alphabetic. We therefore used one-hot encoding to transform these columns into integer columns for our study, resulting in a file of 123 columns; one-hot encoding created 59 additional columns, each holding 0 or 1 instead of an alphabetic string. In the final phase of the model-building process, the column "site eui" is used as the dependent variable. Thus, two arrays named X and Y were created, with X containing all columns except site eui and Y containing only site eui. The site eui column represents the amount of energy a building consumes, as reflected in its utility bills. These X and Y values were used to build the model.
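The encoding and X/Y split described above can be sketched with pandas on a toy frame; the column names and values here are illustrative stand-ins for the real CSV:

```python
import pandas as pd

# Hypothetical mini-frame standing in for the real CSV.
df = pd.DataFrame({
    "floor_area": [1200.0, 850.0, 2300.0],
    "facility_type": ["Office", "School", "Office"],
    "site_eui": [90.1, 75.4, 120.8],
})

# One-hot encode the alphabetic columns into 0/1 integer columns.
df = pd.get_dummies(df, columns=["facility_type"], dtype=int)

# Split into features X and target Y (site_eui).
Y = df["site_eui"]
X = df.drop(columns=["site_eui"])
```

On the real file, the same `get_dummies` call over all alphabetic columns is what grows the frame from 64 to 123 columns.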

We then normalized all of the columns except the "site eui" column, taking x in the formula below to be, in different runs, the column mean, variance, or standard deviation, and compared the results:

              cell_value = (cell_value - avg_of_column) / x

After that, the NaN values in the file were processed. Because the data was almost complete, deleting a row or column was not an option, so we used the fillna function to fill each empty cell with the column's mean value. In multivariate linear regression, normalization is required to bring all variables into the same range: when a model is built without normalization, the coefficients of some independent variables become very large, resulting in poor modeling.
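A minimal sketch of the fill-and-normalize step, using synthetic columns and taking x to be the column standard deviation:

```python
import numpy as np
import pandas as pd

# Toy columns with missing cells, standing in for the real data.
df = pd.DataFrame({"elevation": [10.0, np.nan, 30.0],
                   "avg_temp": [15.0, 25.0, np.nan]})

# Fill each empty cell with its column's mean rather than dropping rows.
df = df.fillna(df.mean())

# cell_value = (cell_value - avg_of_column) / x, here with x = column std.
normalized = (df - df.mean()) / df.std()
```

Swapping `df.std()` for `df.var()` or `df.mean()` gives the other two variants compared in the study.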

Importing File For Regression

Linear regression Modeling

Because the expected output for the "Test" file was not included in the zip file, we worked only with the file named "Train". We split the train file's dataset into two parts: training takes place on the first 60,000 rows, while the remaining rows are used as a new test set to check and compare the model. The effectiveness of a linear regression model is assessed using its model score, the coefficient of determination of the prediction. The best score any model can attain is 1, which is trivially achieved if the value of the output (dependent) variable itself is substituted as an independent variable, giving it a coefficient of one and a score of one.

On the 60,000 rows, a linear regression model (M1) was built with the site eui column as the output variable, and its score was 0.23. This model was then evaluated on the remaining rows, yielding a model score of 0.15. The train score was poor because the data we were given had a lot of variability; the test score is also poor since the predictions come from the train model, which already has a low score.
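The split-and-score procedure can be sketched with plain NumPy on synthetic data; the manual R² below is the same coefficient of determination reported as the model score:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))                 # stand-in for the feature columns
y = X @ rng.normal(size=5) + rng.normal(scale=0.5, size=100)

# Train on the first block of rows, test on the remainder.
split = 60
X_tr, y_tr = X[:split], y[:split]
X_te, y_te = X[split:], y[split:]

# Ordinary least squares fit: coef minimizes ||X_tr @ coef - y_tr||.
coef, *_ = np.linalg.lstsq(X_tr, y_tr, rcond=None)

def r2(X, y, coef):
    # Coefficient of determination: 1 - SS_res / SS_tot.
    resid = y - X @ coef
    return 1.0 - np.sum(resid ** 2) / np.sum((y - y.mean()) ** 2)

train_score = r2(X_tr, y_tr, coef)
test_score = r2(X_te, y_te, coef)
```

On the real data the same split (60,000 train rows, the rest held out) produced the 0.23 and 0.15 scores above.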

Linear Regression

Applying Singular Value Decomposition

The Singular Value Decomposition (SVD) of a matrix is a factorization of that matrix into three matrices. It possesses several intriguing algebraic characteristics and conveys key geometrical and theoretical insights about linear transformations.

The SVD of an m×n matrix A is given by the formula:

A = U S Vᵀ

where U is the m×n matrix of the orthonormal eigenvectors of AAᵀ, Vᵀ is the transpose of the n×n matrix containing the orthonormal eigenvectors of AᵀA, and S is the n×n diagonal matrix of singular values, which are the square roots of the eigenvalues of AᵀA.
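The factorization can be checked numerically with NumPy's reduced SVD on a small matrix:

```python
import numpy as np

A = np.arange(12, dtype=float).reshape(4, 3)   # m = 4, n = 3

# Reduced SVD: U is m x n, s holds the n singular values, Vt is n x n.
U, s, Vt = np.linalg.svd(A, full_matrices=False)

# A is recovered as U @ diag(s) @ Vt.
A_rec = U @ np.diag(s) @ Vt
```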

The first 60,000 rows of the matrix were used to generate two linear regression models, one without SVD (M2) and the other with SVD (M3). Both models were then tested independently on the same 15,000 rows of the matrix after SVD.

Using Old Model M2

We tested the model built from the above 60,000 rows of the matrix without SVD (M2) on the last 15,000 rows of the SVD matrix. If we kept the dimension at 122, i.e., the same shape as the original matrix, this yielded the same score of 0.15, which is expected because a full-rank SVD reproduces the original matrix. We then evaluated the score of model M2 for different values of the input variable K in the SVD implementation.
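Truncating the SVD to a chosen K can be sketched as follows, on a synthetic matrix; keeping all columns reproduces the original matrix exactly, which is why the score is unchanged in that trivial case:

```python
import numpy as np

rng = np.random.default_rng(1)
A = rng.normal(size=(50, 8))        # synthetic stand-in for the data matrix

U, s, Vt = np.linalg.svd(A, full_matrices=False)

def rank_k_approx(k):
    # Keep only the k largest singular values; the rest are discarded.
    return (U[:, :k] * s[:k]) @ Vt[:k, :]

# With k equal to the full column count, the original matrix is recovered,
# so any downstream regression score is unchanged -- the trivial case.
full = rank_k_approx(A.shape[1])
```

Sweeping `k` over a range and scoring the regression at each value produces the score-vs-K curve discussed below.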

The score of model M2 vs. k is shown in the graph below.

Linear Reg with SVD (using Model M2)

Using New model M3

Because of the considerable variation of the data following SVD modeling, model M3's scores were mostly negative. When the dataset is poor and has a lot of variability, the model score can be negative.

Calculating Mean Square Error for the above model

The MSE for the above model was 7.546e+29, which is extremely high, indicating that the data set is poor.
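The MSE itself is just the average squared residual; a minimal sketch on made-up values:

```python
import numpy as np

# Illustrative true and predicted targets.
y_true = np.array([3.0, 5.0, 7.0])
y_pred = np.array([2.5, 5.5, 8.0])

# Mean squared error: average of the squared residuals.
mse = np.mean((y_true - y_pred) ** 2)
```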

Deleting larger values of variable Y

We tried deleting the rows with predicted Y values greater than 10,000 in order to improve the score. This made the score more accurate, indicating that some values in the dataset were substantially hurting the overall linear regression modeling; as a result, 2,233 rows were removed from the data set. We used a threshold of 10,000 because we wanted to see what would happen if very large values were excluded from the dataset, and all of the variables' coefficients were either low (on the order of 10³ to 10⁴) or high (on the order of 10¹² to 10¹⁶).
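The row-filtering step can be sketched with pandas on a toy frame (values are illustrative):

```python
import pandas as pd

# Toy frame standing in for the real data; two rows exceed the threshold.
df = pd.DataFrame({"site_eui": [120.0, 15000.0, 340.0, 99999.0],
                   "floor_area": [1.0, 2.0, 3.0, 4.0]})

# Drop every row whose target exceeds the chosen threshold of 10,000.
threshold = 10_000
kept = df[df["site_eui"] <= threshold]
removed = len(df) - len(kept)
```

Applied to the real data, this filter is what removed the 2,233 offending rows.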

This data set's model score was 0.14 for K = 122, rather than the very large negative number obtained for model M3. After eliminating these rows, the graph of model score vs. K for model M3 is shown below.

Linear Regression with SVD on model M3

Calculating MSE

Principal Component Analysis (PCA) Analysis

PCA is a dimensionality-reduction technique for lowering the dimensionality of large data sets by turning a huge collection of variables into a smaller set that preserves the bulk of the information in the larger set. Naturally, lowering the number of variables in a data set affects accuracy; the trade-off in dimensionality reduction is to give up some accuracy in return for simplicity, because smaller data sets are easier to explore and visualize, and analyzing them is much easier.

PCA analysis automatically normalizes the original matrix before applying SVD and linear regression, which results in a higher score. We ran a PCA analysis on the original dataset, which gave us an array of all the features and their influence on Y. From it we can see that the attribute "floor area" is the most influential, i.e., has the highest coefficient, followed by "id" and "heating degree days," and so on. This leads to the conclusion that as K is changed, columns will be picked for SVD analysis from left to right in this ordering.
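Ranking features by their loading on the first principal component can be sketched with NumPy on synthetic data (the first column is constructed to have the largest variance, so it should dominate PC1):

```python
import numpy as np

rng = np.random.default_rng(2)
# Four synthetic features with decreasing variance.
X = rng.normal(size=(200, 4)) * np.array([5.0, 1.0, 0.5, 0.1])

# Center the data, then take the SVD; rows of Vt are the principal directions.
Xc = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(Xc, full_matrices=False)

# Rank the original columns by the magnitude of their loading on PC1.
loadings = np.abs(Vt[0])
order = np.argsort(loadings)[::-1]
```

On the real data, the same ranking is what places floor area first, followed by id and heating degree days.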

The influence of several columns on "Y" is depicted in the graph below, along with their respective percentages. The X axis depicts the array of columns from PCA analysis, while the Y axis depicts their relative importance.

PCA Analysis

Some New Modeling Ways:

By reducing the number of columns to K (reduced dimension)

The SVD of any matrix outputs three matrices U, s, and Vᵀ (with Vᵀ having dimension d×d). In the next analysis we multiplied the initial dataframe (converted to a NumPy array) by the matrix V to construct a matrix with only K columns and 75,000 rows, where K is the number of dimensions used in the SVD analysis above. A linear regression model applied to this reduced matrix scored far better than the above analysis, where the matrix had 123 columns with low rank (M3): for the same K = 10, this analysis scored 0.04, whereas model M3 scored -1.92.
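Projecting the data onto the first K right singular vectors can be sketched as follows (a 100×12 synthetic array stands in for the 75,000×123 frame):

```python
import numpy as np

rng = np.random.default_rng(3)
X = rng.normal(size=(100, 12))      # stand-in for the full data matrix

U, s, Vt = np.linalg.svd(X, full_matrices=False)

k = 10
# Multiply by the first k right singular vectors: the result keeps one row
# per original row but has only k columns instead of 12.
X_reduced = X @ Vt[:k].T
```

The regression is then fit on `X_reduced`, a genuinely k-dimensional matrix, instead of a full-width matrix of low rank.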

As can be seen, lowering columns improved the model by making the score somewhat positive.

Deleting columns derived from “facility_type”

After looking at the coefficients of the various features, we observed that the values of the one-hot encoded columns had a lot of variability, leading to such a poor SVD score; a score can be negative only when the data is truly bad. We therefore removed the "facility type" columns and ran linear regression (using the standard deviation to standardize the dataset) on both matrices, with and without SVD, which revealed a significant difference in the regression score after applying SVD. This supported the claim that the one-hot encoding of facility type was degrading the dataset.
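Dropping every one-hot column derived from facility type can be sketched with pandas (toy frame, illustrative column names):

```python
import pandas as pd

# Toy frame after one-hot encoding; names are illustrative.
df = pd.DataFrame({
    "floor_area": [1200.0, 850.0],
    "facility_type_Office": [1, 0],
    "facility_type_School": [0, 1],
    "site_eui": [90.1, 75.4],
})

# Drop every one-hot column that came from "facility_type".
cols = [c for c in df.columns if c.startswith("facility_type")]
df = df.drop(columns=cols)
```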

As can be seen in the graph above, eliminating the column “facility_type” improved the score and brought it closer to zero rather than negative infinity.

As can be seen in the 'First Singular Vector vs. Second Singular Vector' graph below, there is less scatter than in the earlier PCA graph, and the model became more linear once the "facility type" columns were removed.

First k dimension plot

The influence of several columns on "Y" is depicted in the graph below, along with their respective percentages. The X axis depicts the array of columns from PCA analysis, while the Y axis depicts their relative importance.

PCA Analysis after dropping Facility Type

Removing Facility Type and using original Normalization