In [1]:
import pandas as pd
import numpy as np
from fbprophet import Prophet
import datetime as dt
from sklearn import metrics
import math
import itertools
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

Import Calendar

In [2]:
calendar = pd.read_csv('calendar.csv')
In [3]:
calendar
Out[3]:
date wm_yr_wk weekday wday month year d event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI
0 2011-01-29 11101 Saturday 1 1 2011 d_1 NaN NaN NaN NaN 0 0 0
1 2011-01-30 11101 Sunday 2 1 2011 d_2 NaN NaN NaN NaN 0 0 0
2 2011-01-31 11101 Monday 3 1 2011 d_3 NaN NaN NaN NaN 0 0 0
3 2011-02-01 11101 Tuesday 4 2 2011 d_4 NaN NaN NaN NaN 1 1 0
4 2011-02-02 11101 Wednesday 5 2 2011 d_5 NaN NaN NaN NaN 1 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1964 2016-06-15 11620 Wednesday 5 6 2016 d_1965 NaN NaN NaN NaN 0 1 1
1965 2016-06-16 11620 Thursday 6 6 2016 d_1966 NaN NaN NaN NaN 0 0 0
1966 2016-06-17 11620 Friday 7 6 2016 d_1967 NaN NaN NaN NaN 0 0 0
1967 2016-06-18 11621 Saturday 1 6 2016 d_1968 NaN NaN NaN NaN 0 0 0
1968 2016-06-19 11621 Sunday 2 6 2016 d_1969 NBAFinalsEnd Sporting Father's day Cultural 0 0 0

1969 rows × 14 columns

Import Sales Data

In [13]:
sales_data = pd.read_csv('https://tmpkaggle.s3.me-south-1.amazonaws.com/sales_train_evaluation.csv')
Xcols = ['d_' + str(i) for i in range(1, 1914)]     # d_1 .. d_1913: training days
ycols = ['d_' + str(i) for i in range(1914, 1942)]  # d_1914 .. d_1941: validation days


ids = sales_data.item_id + '_' + sales_data.store_id

train = sales_data[Xcols]
train = train.T
train.columns = ids
#train

test = sales_data[ycols]
test = test.T
test.columns = ids
#test
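
# quick sanity check on the reshape: 1913 training days and 28 validation days
# across all 30490 products (matching the frames displayed below)
assert train.shape == (1913, 30490)
assert test.shape == (28, 30490)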
In [14]:
train
Out[14]:
HOBBIES_1_001_CA_1 HOBBIES_1_002_CA_1 HOBBIES_1_003_CA_1 HOBBIES_1_004_CA_1 HOBBIES_1_005_CA_1 HOBBIES_1_006_CA_1 HOBBIES_1_007_CA_1 HOBBIES_1_008_CA_1 HOBBIES_1_009_CA_1 HOBBIES_1_010_CA_1 ... FOODS_3_818_WI_3 FOODS_3_819_WI_3 FOODS_3_820_WI_3 FOODS_3_821_WI_3 FOODS_3_822_WI_3 FOODS_3_823_WI_3 FOODS_3_824_WI_3 FOODS_3_825_WI_3 FOODS_3_826_WI_3 FOODS_3_827_WI_3
d_1 0 0 0 0 0 0 0 12 2 0 ... 0 14 1 0 4 0 0 0 0 0
d_2 0 0 0 0 0 0 0 15 0 0 ... 0 11 1 0 4 0 0 6 0 0
d_3 0 0 0 0 0 0 0 0 7 1 ... 0 5 1 0 2 2 0 0 0 0
d_4 0 0 0 0 0 0 0 0 3 0 ... 0 6 1 0 5 2 0 2 0 0
d_5 0 0 0 0 0 0 0 0 0 0 ... 0 5 1 0 2 0 0 2 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
d_1909 1 1 1 0 1 0 1 4 0 0 ... 4 1 1 0 0 0 0 1 1 0
d_1910 3 0 0 1 2 0 0 6 0 0 ... 2 3 3 0 2 1 0 0 0 0
d_1911 0 0 1 3 2 2 0 3 0 2 ... 0 1 6 0 3 0 0 0 3 0
d_1912 1 0 1 7 2 0 1 2 0 0 ... 3 0 0 4 2 0 1 1 1 0
d_1913 1 0 1 2 4 0 1 1 0 2 ... 1 2 1 0 1 1 0 0 3 0

1913 rows × 30490 columns

In [15]:
test
Out[15]:
HOBBIES_1_001_CA_1 HOBBIES_1_002_CA_1 HOBBIES_1_003_CA_1 HOBBIES_1_004_CA_1 HOBBIES_1_005_CA_1 HOBBIES_1_006_CA_1 HOBBIES_1_007_CA_1 HOBBIES_1_008_CA_1 HOBBIES_1_009_CA_1 HOBBIES_1_010_CA_1 ... FOODS_3_818_WI_3 FOODS_3_819_WI_3 FOODS_3_820_WI_3 FOODS_3_821_WI_3 FOODS_3_822_WI_3 FOODS_3_823_WI_3 FOODS_3_824_WI_3 FOODS_3_825_WI_3 FOODS_3_826_WI_3 FOODS_3_827_WI_3
d_1914 0 0 0 0 1 0 0 19 0 0 ... 0 4 2 0 0 0 0 0 1 0
d_1915 0 1 0 0 0 0 0 3 2 2 ... 4 2 2 0 2 0 1 0 3 0
d_1916 0 0 1 1 2 1 0 2 6 1 ... 0 0 1 0 2 0 1 1 0 0
d_1917 2 0 1 2 3 0 0 8 0 0 ... 1 0 1 3 2 2 1 1 1 0
d_1918 0 0 0 4 1 0 1 8 0 0 ... 2 1 1 1 2 2 0 0 2 0
d_1919 3 0 2 1 0 2 0 23 0 0 ... 1 2 1 2 5 0 0 2 1 1
d_1920 5 0 1 6 3 4 1 26 0 0 ... 3 6 0 0 5 0 0 1 0 1
d_1921 0 0 0 4 2 1 0 9 0 0 ... 3 2 1 0 4 0 0 1 2 1
d_1922 0 0 0 0 3 0 1 4 0 0 ... 4 2 1 1 2 2 1 0 1 2
d_1923 1 1 0 0 1 0 0 8 0 0 ... 4 0 1 1 3 0 0 0 1 0
d_1924 1 0 0 0 1 0 0 5 8 0 ... 0 0 2 0 1 0 2 1 2 1
d_1925 0 0 2 2 3 2 0 8 0 2 ... 1 4 2 0 0 1 0 0 0 3
d_1926 2 0 1 2 2 1 1 5 0 0 ... 0 3 1 0 2 1 0 0 2 2
d_1927 1 0 3 4 3 0 0 0 0 2 ... 0 0 3 3 1 1 0 1 1 1
d_1928 2 0 0 2 2 0 0 3 0 1 ... 3 0 3 4 0 0 0 0 1 1
d_1929 2 0 0 1 2 3 0 1 0 0 ... 1 1 1 1 2 0 0 3 0 0
d_1930 1 0 1 1 2 0 0 18 0 0 ... 2 2 1 0 0 1 0 3 2 0
d_1931 0 0 0 1 2 5 0 47 0 1 ... 3 1 1 2 2 0 0 1 1 0
d_1932 2 0 1 1 0 2 0 7 1 0 ... 4 2 4 2 1 1 0 0 1 1
d_1933 4 1 0 1 0 1 1 0 0 0 ... 4 1 0 2 2 0 0 0 1 2
d_1934 0 2 2 0 0 0 0 6 0 1 ... 3 2 5 0 5 3 0 1 1 0
d_1935 0 1 0 4 2 0 0 0 0 0 ... 4 4 1 2 3 0 0 2 4 5
d_1936 0 1 0 0 1 1 0 15 0 2 ... 2 1 0 0 2 1 0 0 6 4
d_1937 0 0 0 1 0 0 1 5 0 1 ... 3 6 3 0 0 1 0 1 0 0
d_1938 3 0 2 3 0 0 0 4 0 1 ... 1 4 3 1 2 0 1 0 1 2
d_1939 3 0 3 0 2 5 1 1 0 0 ... 3 4 3 1 1 0 0 1 1 2
d_1940 0 0 0 2 1 2 1 40 1 0 ... 0 1 0 0 3 1 1 0 1 5
d_1941 1 0 1 6 0 0 0 32 0 1 ... 0 1 1 4 4 1 0 2 0 1

28 rows × 30490 columns
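
The grid search below relies on three date vectors (train_dates, test_dates and predict_dates) that are not defined in this notebook excerpt. A minimal sketch, assuming they simply follow the calendar's d_1 .. d_1969 rows (1913 + 28 + 28 = 1969 days):

dates = pd.to_datetime(calendar['date'])
train_dates = list(dates[:1913])        # d_1 .. d_1913 (training)
test_dates = list(dates[1913:1941])     # d_1914 .. d_1941 (validation horizon)
predict_dates = list(dates[1941:1969])  # d_1942 .. d_1969 (evaluation horizon)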

Set the Grid for the Grid Search

In [20]:
# each entry: [weekly, monthly, yearly Fourier order, seasonality, holiday, changepoint prior scale]
params = [[3, 5, 10, 0.5, 0.5, 0.5],
          [15, 25, 50, 0.1, 0.1, 0.1],
          [20, 30, 60, 0.3, 0.3, 0.3],
          [2, 4, 8, 0.8, 0.8, 0.8],
          [2, 2, 4, 0.8, 0.8, 0.8]]
In [21]:
list(params)
Out[21]:
[[3, 5, 10, 0.5, 0.5, 0.5],
 [15, 25, 50, 0.1, 0.1, 0.1],
 [20, 30, 60, 0.3, 0.3, 0.3],
 [2, 4, 8, 0.8, 0.8, 0.8],
 [2, 2, 4, 0.8, 0.8, 0.8]]
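
The five combinations above are hand-picked, keeping the search at five fits per product. With the itertools import from the top, a full Cartesian grid would be easy to build, though far more expensive across 30490 products. A sketch with illustrative candidate lists:

fourier_orders = [3, 15, 20]
prior_scales = [0.1, 0.5, 0.8]
full_grid = [list(c) for c in itertools.product(fourier_orders, fourier_orders, fourier_orders,
                                                prior_scales, prior_scales, prior_scales)]
len(full_grid)  # 729 combinations instead of 5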
In [25]:
# prepare calendar data: collect, for each event name, the list of dates it falls on
# (an event can appear in either event_name_1 or event_name_2)
calendar['datelist'] = calendar['date'].apply(lambda x: [x])
calendarnew = pd.DataFrame(calendar.groupby('event_name_1')['datelist'].sum()).merge(
    pd.DataFrame(calendar.groupby('event_name_2')['datelist'].sum()),
    left_index=True, right_index=True, how='outer')
calendarnew.datelist_y = [x if isinstance(x, list) else [] for x in calendarnew.datelist_y]
calendarnew.datelist_x = calendarnew.datelist_x + calendarnew.datelist_y
calendarnew = calendarnew.drop(columns='datelist_y')

# prepare holidays data
holidayslist = []
for i, row in calendarnew.iterrows():
    holidayslist.append(
        pd.DataFrame({
            'holiday': i,
            'ds': pd.to_datetime(list(row)[0]),
            'lower_window': 0,
            'upper_window': 1
        })
    )
holidaysconcat = pd.concat(holidayslist)
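
# holidaysconcat now holds one row per (event, date) pair in the format Prophet's
# holidays argument expects ('holiday', 'ds', 'lower_window', 'upper_window');
# upper_window = 1 lets each event also affect the following day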


# function to build one model: the built-in seasonalities are disabled and
# replaced with custom ones so their Fourier orders can be tuned
def build_model(pars):
    wseas, mseas, yseas, s_prior, h_prior, c_prior = pars
    m = Prophet(growth='linear',
                holidays=holidaysconcat,
                daily_seasonality=False,
                weekly_seasonality=False,
                yearly_seasonality=False,
                seasonality_prior_scale=s_prior,
                holidays_prior_scale=h_prior,
                changepoint_prior_scale=c_prior)

    m.add_seasonality(name='weekly', period=7, fourier_order=wseas)
    m.add_seasonality(name='monthly', period=30.5, fourier_order=mseas)
    m.add_seasonality(name='yearly', period=365.25, fourier_order=yseas)

    return m

# function to measure error: plain mean squared error
# (not the WRMSSE metric used for scoring in the Kaggle competition)
def error(list1, list2):
    return sum((x - y)**2 for x, y in zip(list(list1), list(list2))) / len(list(list1))
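
# note: error() is plain MSE, so the sklearn import at the top provides the
# equivalent metrics.mean_squared_error(list1, list2)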

# function to run the train-validate-refit approach on one column (one product);
# train_dates, test_dates and predict_dates are assumed to be defined as sketched above
def onecolfcst(current_col):

    current_data = pd.DataFrame({
        'ds': train_dates,
        'y': train[current_col]})

    best_error = np.inf
    best_params = ()
    best_val_forecast = None

    # grid search: fit on the training days, score on the validation days
    for pars in params:

        m = build_model(pars)
        m.fit(current_data)

        future = pd.DataFrame({'ds': test_dates})
        forecast = m.predict(future)

        curerror = error(forecast['yhat'], test[current_col])

        if curerror < best_error:
            best_error = curerror
            best_params = pars
            best_val_forecast = forecast

    # refit the best model on train + validation to forecast the evaluation horizon
    m = build_model(best_params)
    current_data = pd.DataFrame({
        'ds': list(train_dates) + list(test_dates),
        'y': list(train[current_col]) + list(test[current_col])})
    m.fit(current_data)

    realfuture = pd.DataFrame({'ds': predict_dates})
    realforecast = m.predict(realfuture)

    result = {
        'valforecast': [current_col + '_validation'] + list(best_val_forecast['yhat']),
        'realforecast': [current_col + '_evaluation'] + list(realforecast['yhat']),
        'best_error': best_error,
        'best_params': best_params
    }

    return result
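
# optional smoke test on a single product before launching the full run,
# e.g. onecolfcst('HOBBIES_1_001_CA_1')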

# parallel execution of the forecast on each of the columns
p = Pool(cpu_count())
predictions = list(tqdm(p.imap(onecolfcst, train.columns)))
p.close()
p.join()
30490it [14:37:35,  1.73s/it]
In [27]:
# concatenate all validation forecasts (these fill the public-leaderboard rows)
valforecasts = [x['valforecast'] for x in predictions]
valforecastsdf = pd.DataFrame(valforecasts)
valforecastsdf.iloc[:, 1:] = valforecastsdf.iloc[:, 1:].applymap(lambda x: max(0, x))  # clip negatives to zero
valforecastsdf.columns = ['id'] + ['F' + str(i) for i in range(1, 29)]
In [40]:
# concatenate all evaluation forecasts (these fill the private-leaderboard rows)
realforecasts = [x['realforecast'] for x in predictions]
realforecasts = pd.DataFrame(realforecasts)
realforecasts.iloc[:, 1:] = realforecasts.iloc[:, 1:].applymap(lambda x: max(0, x))  # clip negatives to zero
realforecasts.columns = ['id'] + ['F' + str(i) for i in range(1, 29)]
In [45]:
# concatenate validation and evaluation forecasts into one submission csv
concatted = pd.concat([valforecastsdf, realforecasts], axis = 0)
concatted.to_csv('newestsubmission.csv', index = None)
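
A quick sanity check on the combined frame: one validation row and one evaluation row per product, each with an id column plus F1-F28.

assert concatted.shape == (2 * 30490, 29)  # 60980 rows, id + 28 forecast columns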
In [63]:
# export all errors and best params to csv, to get a better idea of what grid to use next time
errors_and_params = [[x['valforecast'][0], x['best_error'],
                      x['best_params'][0], x['best_params'][1], x['best_params'][2],
                      x['best_params'][3], x['best_params'][4], x['best_params'][5]]
                     for x in predictions]
errors_and_params = pd.DataFrame(errors_and_params)
errors_and_params.columns = ['id', 'best_error', 'fourweek', 'fourmonth', 'fouryear', 's_prior', 'h_prior', 'c_prior']
errors_and_params.to_csv('errors_and_params.csv', index = None)