import pandas as pd
import numpy as np
import pandas as pd
from fbprophet import Prophet
import datetime as dt
from sklearn import metrics
import math
import itertools
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
# Load the calendar (local file) and the daily unit-sales table (remote CSV).
calendar = pd.read_csv('calendar.csv')
sales_data = pd.read_csv('https://tmpkaggle.s3.me-south-1.amazonaws.com/sales_train_evaluation.csv')

# Day columns d_1..d_1913 form the training window; d_1914..d_1941 (28 days)
# are held out and used to score candidate models.
Xcols = ['d_{}'.format(day) for day in range(1, 1914)]
ycols = ['d_{}'.format(day) for day in range(1914, 1942)]

# One identifier per row of the sales table: "<item_id>_<store_id>".
ids = sales_data.item_id + '_' + sales_data.store_id

# Transpose so that each column is one item/store series and each row one day.
train = sales_data[Xcols].T
train.columns = ids

test = sales_data[ycols].T
test.columns = ids
# Candidate hyper-parameter sets tried for every series. Each row is unpacked
# by build_model as:
#   [weekly Fourier order, monthly Fourier order, yearly Fourier order,
#    seasonality prior scale, holidays prior scale, changepoint prior scale]
params = [
    [3, 5, 10, 0.5, 0.5, 0.5],
    [15, 25, 50, 0.1, 0.1, 0.1],
    [20, 30, 60, 0.3, 0.3, 0.3],
    [2, 4, 8, 0.8, 0.8, 0.8],
    [2, 2, 4, 0.8, 0.8, 0.8],
]
# prepare calendar data
# Wrap every date in a one-element list so that a group-wise sum concatenates
# the lists, yielding all dates on which each event name occurs.
calendar['datelist'] = calendar['date'].apply(lambda x: [x])
calendarnew = pd.DataFrame(calendar.groupby('event_name_1')['datelist'].sum()).merge(
    pd.DataFrame(calendar.groupby('event_name_2')['datelist'].sum()),
    left_index=True,
    right_index=True,
    how='outer',
)
# The outer join leaves NaN wherever an event name appears in only one of the
# two event columns. Normalise both sides to empty lists so the concatenation
# below works regardless of which side is missing.
calendarnew['datelist_x'] = [
    dates if isinstance(dates, list) else [] for dates in calendarnew['datelist_x']
]
calendarnew['datelist_y'] = [
    dates if isinstance(dates, list) else [] for dates in calendarnew['datelist_y']
]
# Element-wise list concatenation: all dates of the event from both columns.
calendarnew['datelist_x'] = calendarnew['datelist_x'] + calendarnew['datelist_y']
# ``columns=`` keyword instead of a positional axis: pandas 2.x only accepts
# ``labels`` positionally in DataFrame.drop.
calendarnew = calendarnew.drop(columns='datelist_y')
# prepare holidays data
# One small frame per event name (the index of calendarnew); the first column
# of each row holds that event's list of dates. Every event gets the same
# window: lower_window 0, upper_window 1.
holidayslist = [
    pd.DataFrame({
        'holiday': event_name,
        'ds': pd.to_datetime(event_row.iloc[0]),
        'lower_window': 0,
        'upper_window': 1,
    })
    for event_name, event_row in calendarnew.iterrows()
]
# Stack the per-event frames into the single holidays table given to Prophet.
holidaysconcat = pd.concat(holidayslist)
# function to build one model
def build_model(pars):
    """Create an unfitted Prophet model for one hyper-parameter set.

    Prophet's automatic daily/weekly/yearly seasonalities are switched off and
    replaced by explicitly added weekly, monthly and yearly components whose
    Fourier orders come from ``pars``. Holidays are taken from the
    module-level ``holidaysconcat`` frame.

    Args:
        pars: six values, in order: weekly, monthly and yearly Fourier order,
            then seasonality, holidays and changepoint prior scale.

    Returns:
        The configured (not yet fitted) Prophet instance.
    """
    wseas, mseas, yseas, s_prior, h_prior, c_prior = pars

    model = Prophet(
        growth='linear',
        holidays=holidaysconcat,
        daily_seasonality=False,
        weekly_seasonality=False,
        yearly_seasonality=False,
        seasonality_prior_scale=s_prior,
        holidays_prior_scale=h_prior,
        changepoint_prior_scale=c_prior,
    )

    # (name, period in days, Fourier order) for each custom seasonality.
    seasonalities = (
        ('weekly', 7, wseas),
        ('monthly', 30.5, mseas),
        ('yearly', 365.25, yseas),
    )
    for name, period, order in seasonalities:
        model = model.add_seasonality(name=name, period=period, fourier_order=order)

    return model
# function to measure error (not the error function used in the kaggle competition)
def error(list1, list2):
    """Return the mean squared error between two equal-length sequences.

    Values are paired by position. Both inputs are first copied into plain
    lists, so any labels carried by the inputs (for example a pandas index)
    play no part in the pairing.

    Args:
        list1: iterable of numbers (e.g. predictions).
        list2: iterable of numbers (e.g. observed values).

    Returns:
        The mean of the squared pairwise differences.

    Raises:
        ValueError: if the inputs are empty or differ in length. Without this
            check ``zip`` would silently drop the surplus values while the
            divisor still used the length of ``list1``, giving a wrong mean.
    """
    first = list(list1)
    second = list(list2)
    if len(first) != len(second):
        raise ValueError(
            'error() needs inputs of equal length, got {} and {}'.format(
                len(first), len(second)))
    if not first:
        raise ValueError('error() needs at least one pair of values')
    return sum((x - y) ** 2 for x, y in zip(first, second)) / len(first)
#function to execute train-validate-test approach to a column (one product)
def onecolfcst(current_col):
    """Tune, validate and forecast a single series (one column of ``train``).

    Every parameter set in the module-level ``params`` is fitted on the
    training window and scored with ``error`` against the held-out ``test``
    window. The best-scoring set is then refitted on training plus held-out
    data and used to forecast ``predict_dates``.

    Args:
        current_col: column label present in both ``train`` and ``test``.

    Returns:
        dict with the keys
        ``'valforecast'``: ``[current_col + '_validation']`` followed by the
            best candidate's predictions for ``test_dates``;
        ``'realforecast'``: ``[current_col + '_evaluation']`` followed by the
            refitted model's predictions for ``predict_dates``;
        ``'best_error'``: the best candidate's validation error;
        ``'best_params'``: the parameter set that achieved it.

    Raises:
        ValueError: if no candidate produced an error below infinity (for
            example ``params`` is empty or every error is NaN).

    NOTE(review): ``train_dates``, ``test_dates`` and ``predict_dates`` are
    read as module-level globals but are not defined in the part of the file
    visible here - confirm they exist. ``predict_dates`` presumably holds 28
    dates, since the submission code labels the forecasts F1..F28.
    """
    train_frame = pd.DataFrame({
        'ds': train_dates,
        'y': train[current_col],
    })
    # The validation horizon and its actuals are the same for every candidate.
    future = pd.DataFrame({'ds': test_dates})
    actuals = test[current_col]

    best_error = np.inf
    best_params = ()
    best_val_forecast = None
    for pars in params:
        m = build_model(pars)
        m.fit(train_frame)
        forecast = m.predict(future)
        curerror = error(forecast['yhat'], actuals)
        if curerror < best_error:
            best_error = curerror
            best_params = pars
            best_val_forecast = forecast

    if best_val_forecast is None:
        raise ValueError(
            'no parameter set produced a usable validation error for '
            + str(current_col))

    # Final fit: retrain the winning configuration on train + held-out data.
    m = build_model(best_params)
    full_frame = pd.DataFrame({
        'ds': list(train_dates) + list(test_dates),
        'y': list(train[current_col]) + list(test[current_col]),
    })
    m.fit(full_frame)

    # Forecast the genuinely unseen period (not the validation dates again).
    realfuture = pd.DataFrame({'ds': predict_dates})
    realforecast = m.predict(realfuture)

    return {
        # Report the winning candidate's forecast, not the last one tried.
        'valforecast': [current_col + '_validation'] + list(best_val_forecast['yhat']),
        'realforecast': [current_col + '_evaluation'] + list(realforecast['yhat']),
        'best_error': best_error,
        'best_params': best_params,
    }
# parallel execution of the forecast on each of the columns
# NOTE(review): on platforms whose multiprocessing start method is "spawn",
# worker processes re-import this module; this section and everything after it
# would then need to sit under ``if __name__ == '__main__':``.
with Pool(cpu_count()) as p:
    # imap preserves input order, so predictions[i] belongs to
    # train.columns[i]. ``total`` lets tqdm show a real progress bar.
    predictions = list(tqdm(p.imap(onecolfcst, train.columns),
                            total=len(train.columns)))
    # Graceful shutdown on success; if anything above raises, leaving the
    # ``with`` block terminates the workers instead of leaking them.
    p.close()
    p.join()
def _submission_frame(rows):
    """Shape rows of ``[id, value_1, ..., value_28]`` into submission layout.

    Values that are not strictly positive (negatives, and NaN as well) are
    replaced by 0, which is what the former per-cell ``max(0, x)`` produced.
    Columns are named ``id`` and ``F1``..``F28``.
    """
    frame = pd.DataFrame(rows)
    values = frame.iloc[:, 1:]
    # Vectorised replacement for the deprecated DataFrame.applymap call.
    frame.iloc[:, 1:] = values.where(values > 0, 0)
    frame.columns = ['id'] + ['F' + str(i) for i in range(1, 29)]
    return frame


# concat all validation forecasts (they have to be submitted for the public leaderboard)
valforecasts = [x['valforecast'] for x in predictions]
valforecastsdf = _submission_frame(valforecasts)

# concat all test forecasts (they have to be submitted for the private leaderboard)
realforecasts = _submission_frame([x['realforecast'] for x in predictions])

# concat validation and test into a submission csv
concatted = pd.concat([valforecastsdf, realforecasts], axis=0)
concatted.to_csv('newestsubmission.csv', index=False)

# export all errors and params to csv to have a better idea of what to use for next time
# Each row: the series' validation id, its best validation error, then the six
# winning hyper-parameters in the order used by build_model.
errors_and_params = pd.DataFrame(
    [
        [x['valforecast'][0], x['best_error']] + list(x['best_params'][:6])
        for x in predictions
    ],
    columns=['id', 'best_error', 'fourweek', 'fourmonth', 'fouryear',
             's_prior', 'h_prior', 'c_prior'],
)
errors_and_params.to_csv('errors_and_params.csv', index=False)