In [1]:
import pandas as pd
import numpy as np
from fbprophet import Prophet
import datetime as dt
from sklearn import metrics
import math
import itertools
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

Import Calendar

In [2]:
calendar = pd.read_csv('calendar.csv')
In [3]:
calendar
Out[3]:
date wm_yr_wk weekday wday month year d event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI
0 2011-01-29 11101 Saturday 1 1 2011 d_1 NaN NaN NaN NaN 0 0 0
1 2011-01-30 11101 Sunday 2 1 2011 d_2 NaN NaN NaN NaN 0 0 0
2 2011-01-31 11101 Monday 3 1 2011 d_3 NaN NaN NaN NaN 0 0 0
3 2011-02-01 11101 Tuesday 4 2 2011 d_4 NaN NaN NaN NaN 1 1 0
4 2011-02-02 11101 Wednesday 5 2 2011 d_5 NaN NaN NaN NaN 1 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1964 2016-06-15 11620 Wednesday 5 6 2016 d_1965 NaN NaN NaN NaN 0 1 1
1965 2016-06-16 11620 Thursday 6 6 2016 d_1966 NaN NaN NaN NaN 0 0 0
1966 2016-06-17 11620 Friday 7 6 2016 d_1967 NaN NaN NaN NaN 0 0 0
1967 2016-06-18 11621 Saturday 1 6 2016 d_1968 NaN NaN NaN NaN 0 0 0
1968 2016-06-19 11621 Sunday 2 6 2016 d_1969 NBAFinalsEnd Sporting Father's day Cultural 0 0 0

1969 rows × 14 columns

Import Sales Data

In [13]:
sales_data = pd.read_csv('https://tmpkaggle.s3.me-south-1.amazonaws.com/sales_train_evaluation.csv')
Xcols = ['d_' + str(i) for i in range(1, 1914)]     # d_1 .. d_1913: training days
ycols = ['d_' + str(i) for i in range(1914, 1942)]  # d_1914 .. d_1941: validation days


ids = sales_data.item_id + '_' + sales_data.store_id

train = sales_data[Xcols]
train = train.T
train.columns = ids
#train

test = sales_data[ycols]
test = test.T
test.columns = ids
#test
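
# quick sanity check on the reshape: 1913 training days and 28 validation days
# across all 30490 products (matching the frames displayed below)
assert train.shape == (1913, 30490)
assert test.shape == (28, 30490)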
In [14]:
train
Out[14]:
HOBBIES_1_001_CA_1 HOBBIES_1_002_CA_1 HOBBIES_1_003_CA_1 HOBBIES_1_004_CA_1 HOBBIES_1_005_CA_1 HOBBIES_1_006_CA_1 HOBBIES_1_007_CA_1 HOBBIES_1_008_CA_1 HOBBIES_1_009_CA_1 HOBBIES_1_010_CA_1 ... FOODS_3_818_WI_3 FOODS_3_819_WI_3 FOODS_3_820_WI_3 FOODS_3_821_WI_3 FOODS_3_822_WI_3 FOODS_3_823_WI_3 FOODS_3_824_WI_3 FOODS_3_825_WI_3 FOODS_3_826_WI_3 FOODS_3_827_WI_3
d_1 0 0 0 0 0 0 0 12 2 0 ... 0 14 1 0 4 0 0 0 0 0
d_2 0 0 0 0 0 0 0 15 0 0 ... 0 11 1 0 4 0 0 6 0 0
d_3 0 0 0 0 0 0 0 0 7 1 ... 0 5 1 0 2 2 0 0 0 0
d_4 0 0 0 0 0 0 0 0 3 0 ... 0 6 1 0 5 2 0 2 0 0
d_5 0 0 0 0 0 0 0 0 0 0 ... 0 5 1 0 2 0 0 2 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
d_1909 1 1 1 0 1 0 1 4 0 0 ... 4 1 1 0 0 0 0 1 1 0
d_1910 3 0 0 1 2 0 0 6 0 0 ... 2 3 3 0 2 1 0 0 0 0
d_1911 0 0 1 3 2 2 0 3 0 2 ... 0 1 6 0 3 0 0 0 3 0
d_1912 1 0 1 7 2 0 1 2 0 0 ... 3 0 0 4 2 0 1 1 1 0
d_1913 1 0 1 2 4 0 1 1 0 2 ... 1 2 1 0 1 1 0 0 3 0

1913 rows × 30490 columns

In [15]:
test
Out[15]:
HOBBIES_1_001_CA_1 HOBBIES_1_002_CA_1 HOBBIES_1_003_CA_1 HOBBIES_1_004_CA_1 HOBBIES_1_005_CA_1 HOBBIES_1_006_CA_1 HOBBIES_1_007_CA_1 HOBBIES_1_008_CA_1 HOBBIES_1_009_CA_1 HOBBIES_1_010_CA_1 ... FOODS_3_818_WI_3 FOODS_3_819_WI_3 FOODS_3_820_WI_3 FOODS_3_821_WI_3 FOODS_3_822_WI_3 FOODS_3_823_WI_3 FOODS_3_824_WI_3 FOODS_3_825_WI_3 FOODS_3_826_WI_3 FOODS_3_827_WI_3
d_1914 0 0 0 0 1 0 0 19 0 0 ... 0 4 2 0 0 0 0 0 1 0
d_1915 0 1 0 0 0 0 0 3 2 2 ... 4 2 2 0 2 0 1 0 3 0
d_1916 0 0 1 1 2 1 0 2 6 1 ... 0 0 1 0 2 0 1 1 0 0
d_1917 2 0 1 2 3 0 0 8 0 0 ... 1 0 1 3 2 2 1 1 1 0
d_1918 0 0 0 4 1 0 1 8 0 0 ... 2 1 1 1 2 2 0 0 2 0
d_1919 3 0 2 1 0 2 0 23 0 0 ... 1 2 1 2 5 0 0 2 1 1
d_1920 5 0 1 6 3 4 1 26 0 0 ... 3 6 0 0 5 0 0 1 0 1
d_1921 0 0 0 4 2 1 0 9 0 0 ... 3 2 1 0 4 0 0 1 2 1
d_1922 0 0 0 0 3 0 1 4 0 0 ... 4 2 1 1 2 2 1 0 1 2
d_1923 1 1 0 0 1 0 0 8 0 0 ... 4 0 1 1 3 0 0 0 1 0
d_1924 1 0 0 0 1 0 0 5 8 0 ... 0 0 2 0 1 0 2 1 2 1
d_1925 0 0 2 2 3 2 0 8 0 2 ... 1 4 2 0 0 1 0 0 0 3
d_1926 2 0 1 2 2 1 1 5 0 0 ... 0 3 1 0 2 1 0 0 2 2
d_1927 1 0 3 4 3 0 0 0 0 2 ... 0 0 3 3 1 1 0 1 1 1
d_1928 2 0 0 2 2 0 0 3 0 1 ... 3 0 3 4 0 0 0 0 1 1
d_1929 2 0 0 1 2 3 0 1 0 0 ... 1 1 1 1 2 0 0 3 0 0
d_1930 1 0 1 1 2 0 0 18 0 0 ... 2 2 1 0 0 1 0 3 2 0
d_1931 0 0 0 1 2 5 0 47 0 1 ... 3 1 1 2 2 0 0 1 1 0
d_1932 2 0 1 1 0 2 0 7 1 0 ... 4 2 4 2 1 1 0 0 1 1
d_1933 4 1 0 1 0 1 1 0 0 0 ... 4 1 0 2 2 0 0 0 1 2
d_1934 0 2 2 0 0 0 0 6 0 1 ... 3 2 5 0 5 3 0 1 1 0
d_1935 0 1 0 4 2 0 0 0 0 0 ... 4 4 1 2 3 0 0 2 4 5
d_1936 0 1 0 0 1 1 0 15 0 2 ... 2 1 0 0 2 1 0 0 6 4
d_1937 0 0 0 1 0 0 1 5 0 1 ... 3 6 3 0 0 1 0 1 0 0
d_1938 3 0 2 3 0 0 0 4 0 1 ... 1 4 3 1 2 0 1 0 1 2
d_1939 3 0 3 0 2 5 1 1 0 0 ... 3 4 3 1 1 0 0 1 1 2
d_1940 0 0 0 2 1 2 1 40 1 0 ... 0 1 0 0 3 1 1 0 1 5
d_1941 1 0 1 6 0 0 0 32 0 1 ... 0 1 1 4 4 1 0 2 0 1

28 rows × 30490 columns
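
The grid search below relies on three date vectors (train_dates, test_dates and predict_dates) that are not defined in this notebook excerpt. A minimal sketch, assuming they simply follow the calendar's d_1 .. d_1969 rows (1913 + 28 + 28 = 1969 days):

dates = pd.to_datetime(calendar['date'])
train_dates = list(dates[:1913])        # d_1 .. d_1913 (training)
test_dates = list(dates[1913:1941])     # d_1914 .. d_1941 (validation horizon)
predict_dates = list(dates[1941:1969])  # d_1942 .. d_1969 (evaluation horizon)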

Set the Grid for the Grid Search

In [20]:
# each entry: [weekly, monthly, yearly Fourier order, seasonality, holiday, changepoint prior scale]
params = [[3, 5, 10, 0.5, 0.5, 0.5],
          [15, 25, 50, 0.1, 0.1, 0.1],
          [20, 30, 60, 0.3, 0.3, 0.3],
          [2, 4, 8, 0.8, 0.8, 0.8],
          [2, 2, 4, 0.8, 0.8, 0.8]]
In [21]:
list(params)
Out[21]:
[[3, 5, 10, 0.5, 0.5, 0.5],
 [15, 25, 50, 0.1, 0.1, 0.1],
 [20, 30, 60, 0.3, 0.3, 0.3],
 [2, 4, 8, 0.8, 0.8, 0.8],
 [2, 2, 4, 0.8, 0.8, 0.8]]
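
The five combinations above are hand-picked, keeping the search at five fits per product. With the itertools import from the top, a full Cartesian grid would be easy to build, though far more expensive across 30490 products. A sketch with illustrative candidate lists:

fourier_orders = [3, 15, 20]
prior_scales = [0.1, 0.5, 0.8]
full_grid = [list(c) for c in itertools.product(fourier_orders, fourier_orders, fourier_orders,
                                                prior_scales, prior_scales, prior_scales)]
len(full_grid)  # 729 combinations instead of 5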
In [25]:
# prepare calendar data: collect, for each event name, the list of dates it falls on
# (an event can appear in either event_name_1 or event_name_2)
calendar['datelist'] = calendar['date'].apply(lambda x: [x])
calendarnew = pd.DataFrame(calendar.groupby('event_name_1')['datelist'].sum()).merge(
    pd.DataFrame(calendar.groupby('event_name_2')['datelist'].sum()),
    left_index=True, right_index=True, how='outer')
calendarnew.datelist_y = [x if isinstance(x, list) else [] for x in calendarnew.datelist_y]
calendarnew.datelist_x = calendarnew.datelist_x + calendarnew.datelist_y
calendarnew = calendarnew.drop(columns='datelist_y')

# prepare holidays data
holidayslist = []
for i, row in calendarnew.iterrows():
    holidayslist.append(
        pd.DataFrame({
            'holiday': i,
            'ds': pd.to_datetime(list(row)[0]),
            'lower_window': 0,
            'upper_window': 1
        })
    )
holidaysconcat = pd.concat(holidayslist)
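
# holidaysconcat now holds one row per (event, date) pair in the format Prophet's
# holidays argument expects ('holiday', 'ds', 'lower_window', 'upper_window');
# upper_window = 1 lets each event also affect the following day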


# function to build one model: the built-in seasonalities are disabled and
# replaced with custom ones so their Fourier orders can be tuned
def build_model(pars):
    wseas, mseas, yseas, s_prior, h_prior, c_prior = pars
    m = Prophet(growth='linear',
                holidays=holidaysconcat,
                daily_seasonality=False,
                weekly_seasonality=False,
                yearly_seasonality=False,
                seasonality_prior_scale=s_prior,
                holidays_prior_scale=h_prior,
                changepoint_prior_scale=c_prior)

    m.add_seasonality(name='weekly', period=7, fourier_order=wseas)
    m.add_seasonality(name='monthly', period=30.5, fourier_order=mseas)
    m.add_seasonality(name='yearly', period=365.25, fourier_order=yseas)

    return m

# function to measure error: plain mean squared error
# (not the WRMSSE metric used for scoring in the Kaggle competition)
def error(list1, list2):
    return sum((x - y)**2 for x, y in zip(list(list1), list(list2))) / len(list(list1))
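
# note: error() is plain MSE, so the sklearn import at the top provides the
# equivalent metrics.mean_squared_error(list1, list2)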

# function to run the train-validate-refit approach on one column (one product);
# train_dates, test_dates and predict_dates are assumed to be defined as sketched above
def onecolfcst(current_col):

    current_data = pd.DataFrame({
        'ds': train_dates,
        'y': train[current_col]})

    best_error = np.inf
    best_params = ()
    best_val_forecast = None

    # grid search: fit on the training days, score on the validation days
    for pars in params:

        m = build_model(pars)
        m.fit(current_data)

        future = pd.DataFrame({'ds': test_dates})
        forecast = m.predict(future)

        curerror = error(forecast['yhat'], test[current_col])

        if curerror < best_error:
            best_error = curerror
            best_params = pars
            best_val_forecast = forecast

    # refit the best model on train + validation to forecast the evaluation horizon
    m = build_model(best_params)
    current_data = pd.DataFrame({
        'ds': list(train_dates) + list(test_dates),
        'y': list(train[current_col]) + list(test[current_col])})
    m.fit(current_data)

    realfuture = pd.DataFrame({'ds': predict_dates})
    realforecast = m.predict(realfuture)

    result = {
        'valforecast': [current_col + '_validation'] + list(best_val_forecast['yhat']),
        'realforecast': [current_col + '_evaluation'] + list(realforecast['yhat']),
        'best_error': best_error,
        'best_params': best_params
    }

    return result
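
# optional smoke test on a single product before launching the full run,
# e.g. onecolfcst('HOBBIES_1_001_CA_1')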

# parallel execution of the forecast on each of the columns
p = Pool(cpu_count())
predictions = list(tqdm(p.imap(onecolfcst, train.columns)))
p.close()
p.join()
30490it [14:37:35,  1.73s/it]
In [27]:
# concatenate all validation forecasts (these fill the public-leaderboard rows)
valforecasts = [x['valforecast'] for x in predictions]
valforecastsdf = pd.DataFrame(valforecasts)
valforecastsdf.iloc[:, 1:] = valforecastsdf.iloc[:, 1:].applymap(lambda x: max(0, x))  # clip negatives to zero
valforecastsdf.columns = ['id'] + ['F' + str(i) for i in range(1, 29)]
In [40]:
# concatenate all evaluation forecasts (these fill the private-leaderboard rows)
realforecasts = [x['realforecast'] for x in predictions]
realforecasts = pd.DataFrame(realforecasts)
realforecasts.iloc[:, 1:] = realforecasts.iloc[:, 1:].applymap(lambda x: max(0, x))  # clip negatives to zero
realforecasts.columns = ['id'] + ['F' + str(i) for i in range(1, 29)]
In [45]:
# concatenate validation and evaluation forecasts into one submission csv
concatted = pd.concat([valforecastsdf, realforecasts], axis = 0)
concatted.to_csv('newestsubmission.csv', index = None)
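
A quick sanity check on the combined frame: one validation row and one evaluation row per product, each with an id column plus F1-F28.

assert concatted.shape == (2 * 30490, 29)  # 60980 rows, id + 28 forecast columns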
In [63]:
# export all errors and best params to csv, to get a better idea of what grid to use next time
errors_and_params = [[x['valforecast'][0], x['best_error'],
                      x['best_params'][0], x['best_params'][1], x['best_params'][2],
                      x['best_params'][3], x['best_params'][4], x['best_params'][5]]
                     for x in predictions]
errors_and_params = pd.DataFrame(errors_and_params)
errors_and_params.columns = ['id', 'best_error', 'fourweek', 'fourmonth', 'fouryear', 's_prior', 'h_prior', 'c_prior']
errors_and_params.to_csv('errors_and_params.csv', index = None)