# Statistical Baselines

In [1]:
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet

In [2]:
test_df = pd.read_csv("../../data/prepared/test.csv")
train_df = pd.read_csv("../../data/prepared/train.csv")
valid_df = pd.read_csv("../../data/prepared/valid.csv")
unlabeled_df = pd.read_csv("../../data/prepared/unlabeled.csv")
bigger_train_df = pd.concat([train_df, valid_df], ignore_index=True)
results = {}

In [3]:
test_df

Unnamed: 0,ticket,problem_abstract,team,hours_to_resolve
0,304302318,GSSI EMEA Terminal Server Container NOT Resolv...,incm,5953
1,313179984,Credentials for new customer Siemens Energy,security,1
2,312790866,Request to provide Telco Inventory (BMP & WFA ...,advocacy,1311
3,311016034,ACTION Web will not show me router information,action,1
4,309247069,Trying to change password get popup with fatal...,gtac,117
...,...,...,...,...
901,310836587,GTAC Password Policies report,gtac,0
902,308609727,trying to set up a panorama id,gtac,384
903,305270833,Please decom GTAC domain,front door,1443
904,313358375,Missing USO Details,integrated experience platform (ixp),476


## Baseline

In [4]:
baseline_classifier = DummyRegressor(strategy='mean')
baseline_classifier.fit(train_df["problem_abstract"], train_df["hours_to_resolve"])
y_pred = baseline_classifier.predict(test_df["problem_abstract"])
results["dummy mean"] = mean_squared_error(test_df["hours_to_resolve"], y_pred, squared=False)
print("Root Mean Squared Error (RMSE):", results["dummy mean"])

baseline_classifier = DummyRegressor(strategy='median')
baseline_classifier.fit(train_df["problem_abstract"], train_df["hours_to_resolve"])
y_pred = baseline_classifier.predict(test_df["problem_abstract"])
results["dummy median"] = mean_squared_error(test_df["hours_to_resolve"], y_pred, squared=False)
print("Root Mean Squared Error (RMSE):", results["dummy median"])

Root Mean Squared Error (RMSE): 2375.462056085861
Root Mean Squared Error (RMSE): 2474.2800560578708


## Linear Regression

In [5]:
# Fit tfidf transformer on unlabeled and train data
train_unlabeled_df = pd.concat([train_df, unlabeled_df], ignore_index=True)
tfidf = TfidfVectorizer().fit(train_unlabeled_df['problem_abstract'].apply(lambda s: "" if s is None or pd.isna(s) else s))

# Transform train and test data
X_train = tfidf.transform(train_df["problem_abstract"].apply(lambda s: "" if s is None or pd.isna(s) else s))
y_train = train_df['hours_to_resolve']
X_test = tfidf.transform(test_df['problem_abstract'].apply(lambda s: "" if s is None or pd.isna(s) else s))
y_test = test_df['hours_to_resolve']

In [6]:
def finetune(X_train, y_train, model, param_grid, metric, cv=5, verbose=2):

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric, verbose=verbose)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    return best_model, best_params

def evaluate(X_test, y_test, model):

    y_pred = model.predict(X_test)
    metric = mean_squared_error(y_test, y_pred, squared=False)
    return metric

model, model_params = finetune(
    X_train, 
    y_train,
    ElasticNet(random_state=42),
    {
        "alpha": [0.01, 0.1, 1.0, 10.0],
        "l1_ratio": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    },
    "neg_root_mean_squared_error",
    cv=5,
    verbose=2
    )
print("Best Hyperparameters:", model_params)
results["linear regression"] = evaluate(X_test, y_test, model)
print("Root Mean Squared Error (RMSE):", results["linear regression"])

Fitting 5 folds for each of 44 candidates, totalling 220 fits


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=  24.6s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=  24.6s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=  24.2s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=  24.2s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=  23.9s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.3; total time=   0.2s
[CV] END ...................

  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=1.0; total time=  12.7s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=1.0; total time=  12.7s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=1.0; total time=  12.5s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=1.0; total time=  12.4s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=1.0; total time=  12.4s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=  17.5s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=  17.5s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=  17.3s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=  17.3s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=  17.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.3; total time=   0.1s
[CV] END ...................

  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=1.0; total time=  10.7s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=1.0; total time=  10.7s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=1.0; total time=  10.5s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=1.0; total time=  10.5s
[CV] END ............................alpha=0.1, l1_ratio=1.0; total time=   7.0s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=  17.5s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=  17.4s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=  17.2s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=  17.2s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=  16.9s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.3; total time=   0.1s
[CV] END ...................

  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=  17.4s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=  17.4s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=  17.0s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=  17.0s


  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=  16.7s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.3; total time=   0.0s
[CV] END ...................

## Support Vector Regression

In [7]:
model, model_params = finetune(
    X_train, 
    y_train,
    LinearSVR(random_state=42, epsilon=0),
    {
        "loss": ["squared_epsilon_insensitive", "squared_epsilon_sensitive"],
        "intercept_scaling": [0.001, 0.01, 0.1, 1.0, 10.0, 100]
    },
    "neg_root_mean_squared_error",
    cv=5,
    verbose=2
    )
print("Best Hyperparameters:", model_params)
results["linear svm"] = evaluate(X_test, y_test, model)
print("Root Mean Squared Error (RMSE):", results["linear svm"])

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time= 



[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   0.1s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   0.1s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   0.1s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   0.1s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   0.1s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   0.1s




[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   0.1s
[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   0.1s


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/var/tmp/xmacko1/bachelor_thesis/.pyenv/versions/3.8.9/envs/ticketing-system/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/var/tmp/xmacko1/bachelor_thesis/.pyenv/versions/3.8.9/envs/ticketing-system/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/var/tmp/xmacko1/bachelor_thesis/.pyenv/versions/3.8.9/envs/ticketing-system/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    val

[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   0.1s
[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   0.1s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
Best Hyperparameters: {'intercept_scaling': 10.0, 'loss': 'squared_epsilon_insensitive'}
Root Mean Squared Error (RMSE): 2193.780371373904




## Random Forest Regression

In [8]:
model, model_params = finetune(
    X_train, 
    y_train,
    RandomForestRegressor(random_state=42, n_jobs=-1),
    {
        "min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
        "min_samples_leaf": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
        "max_samples": [0.6, 0.7, 0.8, 0.9, 1.0]
    },
    "neg_root_mean_squared_error",
    cv=5,
    verbose=2
    )
print("Best Hyperparameters:", model_params)
results["random forest"] = evaluate(X_test, y_test, model)
print("Root Mean Squared Error (RMSE):", results["random forest"])

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   0.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   0.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   0.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   0.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   0.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   0.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   0.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   0.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   0.8s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   0.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=6; total tim

## Evaluation

In [9]:
pd.DataFrame.from_dict(results, orient="index", columns=["rmse"])

Unnamed: 0,rmse
dummy mean,2375.462056
dummy median,2474.280056
linear regression,2125.915038
linear svm,2193.780371
random forest,2204.708242
