# Word Embeddings

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from gensim.models import KeyedVectors
import gensim
import gensim.downloader as api

In [2]:
test_df = pd.read_csv("../../data/prepared/test.csv")
train_df = pd.read_csv("../../data/prepared/train.csv")
valid_df = pd.read_csv("../../data/prepared/valid.csv")
unlabeled_df = pd.read_csv("../../data/prepared/unlabeled.csv")
bigger_train_df = pd.concat([train_df, valid_df], ignore_index=True)
results = {}

## Baseline

In [3]:
baseline_classifier = DummyRegressor(strategy='mean')
baseline_classifier.fit(train_df["problem_abstract"], train_df["hours_to_resolve"])
y_pred = baseline_classifier.predict(test_df["problem_abstract"])
results["dummy mean"] = mean_squared_error(test_df["hours_to_resolve"], y_pred, squared=False)
print("Root Mean Squared Error (RMSE):", results["dummy mean"])

baseline_classifier = DummyRegressor(strategy='median')
baseline_classifier.fit(train_df["problem_abstract"], train_df["hours_to_resolve"])
y_pred = baseline_classifier.predict(test_df["problem_abstract"])
results["dummy median"] = mean_squared_error(test_df["hours_to_resolve"], y_pred, squared=False)
print("Root Mean Squared Error (RMSE):", results["dummy median"])

Root Mean Squared Error (RMSE): 2375.462056085861
Root Mean Squared Error (RMSE): 2474.2800560578708


## Linear Regression

In [4]:
# Function to convert text into averaged word embeddings
def text_to_avg_vector(text, model):
    words = text.split()
    words_in_vocab = [word for word in words if word in model.key_to_index]
    if len(words_in_vocab) == 0:
        return np.zeros(model.vector_size)
    avg_vector = np.mean([model[word] for word in words_in_vocab], axis=0)
    return avg_vector

def transform_with_embeddings(df, model):
    return np.array(df['problem_abstract'].apply(lambda s: text_to_avg_vector("" if s is None or pd.isna(s) else s, model)).tolist())

def finetune(X_train, y_train, model, param_grid, metric, cv=5, verbose=2):

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric, verbose=verbose)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    return best_model, best_params

def evaluate(X_test, y_test, model):

    y_pred = model.predict(X_test)
    metric = f1_score(y_test, y_pred, average="micro")
    return metric

word2vec_model = KeyedVectors.load_word2vec_format('../../models/GoogleNews-vectors-negative300.bin', binary=True)
glove_model = KeyedVectors.load_word2vec_format('../../models/glove.840B.300d.txt', binary=False, no_header=True)
# Transform train and test data
X_train_word2vec = transform_with_embeddings(train_df, word2vec_model)
X_train_glove = transform_with_embeddings(train_df, glove_model)
y_train = train_df['hours_to_resolve']
X_test_word2vec = transform_with_embeddings(test_df, word2vec_model)
X_test_glove = transform_with_embeddings(test_df, glove_model)
y_test = test_df['hours_to_resolve']

In [5]:
def finetune(X_train, y_train, model, param_grid, metric, cv=5, verbose=2):

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric, verbose=verbose)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    return best_model, best_params

def evaluate(X_test, y_test, model):

    y_pred = model.predict(X_test)
    metric = mean_squared_error(y_test, y_pred, squared=False)
    return metric

In [6]:
model, model_params = finetune(
    X_train_word2vec, 
    y_train,
    ElasticNet(random_state=42),
    {
        "alpha": [0.01, 0.1, 1.0, 10.0],
        "l1_ratio": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    },
    "neg_root_mean_squared_error",
    cv=5,
    verbose=2
    )
print("Best Hyperparameters:", model_params)
results["linear regression word2vec"] = evaluate(X_test_word2vec, y_test, model)
print("Root Mean Squared Error (RMSE):", results["linear regression word2vec"])

Fitting 5 folds for each of 44 candidates, totalling 220 fits


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   3.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   2.9s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   2.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   2.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   3.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.4s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.3; total time=   0.2s
[CV] END ...................

  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   3.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   2.8s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   2.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   2.9s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   2.5s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.3; total time=   0.0s
[CV] END ...................

  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   2.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   1.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   1.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   1.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   1.8s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.3; total time=   0.0s
[CV] END ...................

  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   1.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   3.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   3.9s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   2.8s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   2.4s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.1s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.3; total time=   0.0s
[CV] END ...................

In [7]:
model, model_params = finetune(
    X_train_glove, 
    y_train,
    ElasticNet(random_state=42),
    {
        "alpha": [0.01, 0.1, 1.0, 10.0],
        "l1_ratio": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    },
    "neg_root_mean_squared_error",
    cv=5,
    verbose=2
    )
print("Best Hyperparameters:", model_params)
results["linear regression glove"] = evaluate(X_test_glove, y_test, model)
print("Root Mean Squared Error (RMSE):", results["linear regression glove"])

Fitting 5 folds for each of 44 candidates, totalling 220 fits


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   1.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   1.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   2.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   2.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time=   1.9s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.3s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.3s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.3s
[CV] END ...........................alpha=0.01, l1_ratio=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.2s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.3s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.3s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.3s
[CV] END ...........................alpha=0.01, l1_ratio=0.2; total time=   0.3s
[CV] END ...........................alpha=0.01, l1_ratio=0.3; total time=   0.2s
[CV] END ...................

  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   2.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   0.7s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   0.9s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   1.0s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=0.1, l1_ratio=0.0; total time=   0.9s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.2; total time=   0.1s
[CV] END ............................alpha=0.1, l1_ratio=0.3; total time=   0.0s
[CV] END ...................

  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   1.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   1.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   1.1s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   1.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END ............................alpha=1.0, l1_ratio=0.0; total time=   1.7s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.1; total time=   0.1s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.2; total time=   0.0s
[CV] END ............................alpha=1.0, l1_ratio=0.3; total time=   0.0s
[CV] END ...................

  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   1.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   1.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   1.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   1.1s


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=10.0, l1_ratio=0.0; total time=   1.1s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=10.0, l1_ratio=0.3; total time=   0.0s
[CV] END ...................

## Support Vector Regression

In [8]:
model, model_params = finetune(
    X_train_word2vec, 
    y_train,
    LinearSVR(random_state=42, epsilon=0),
    {
        "loss": ["squared_epsilon_insensitive", "squared_epsilon_sensitive"],
        "intercept_scaling": [0.001, 0.01, 0.1, 1.0, 10.0, 100]
    },
    "neg_root_mean_squared_error",
    cv=5,
    verbose=2
    )
print("Best Hyperparameters:", model_params)
results["linear svm word2vec"] = evaluate(X_test_word2vec, y_test, model)
print("Root Mean Squared Error (RMSE):", results["linear svm word2vec"])

Fitting 5 folds for each of 12 candidates, totalling 60 fits




[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.4s




[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.4s




[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.3s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.4s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.4s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.4s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.4s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.2s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.3s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.3s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.3s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   1.6s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   1.9s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   1.6s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   1.6s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   1.6s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   1.6s




[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   1.6s




[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   1.6s




[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   1.6s


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/var/tmp/xmacko1/bachelor_thesis/.pyenv/versions/3.8.9/envs/ticketing-system/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/var/tmp/xmacko1/bachelor_thesis/.pyenv/versions/3.8.9/envs/ticketing-system/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/var/tmp/xmacko1/bachelor_thesis/.pyenv/versions/3.8.9/envs/ticketing-system/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    val

[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   1.6s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
Best Hyperparameters: {'intercept_scaling': 100, 'loss': 'squared_epsilon_insensitive'}
Root Mean Squared Error (RMSE): 2579.291465405079




In [9]:
model, model_params = finetune(
    X_train_glove, 
    y_train,
    LinearSVR(random_state=42, epsilon=0),
    {
        "loss": ["squared_epsilon_insensitive", "squared_epsilon_sensitive"],
        "intercept_scaling": [0.001, 0.01, 0.1, 1.0, 10.0, 100]
    },
    "neg_root_mean_squared_error",
    cv=5,
    verbose=2
    )
print("Best Hyperparameters:", model_params)
results["linear svm glove"] = evaluate(X_test_glove, y_test, model)
print("Root Mean Squared Error (RMSE):", results["linear svm glove"])



Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.6s




[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.6s




[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.6s




[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.8s




[CV] END intercept_scaling=0.001, loss=squared_epsilon_insensitive; total time=   0.9s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.001, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.6s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.6s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.6s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.6s




[CV] END intercept_scaling=0.01, loss=squared_epsilon_insensitive; total time=   0.9s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.01, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.8s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.7s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.8s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.8s




[CV] END intercept_scaling=0.1, loss=squared_epsilon_insensitive; total time=   0.9s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=0.1, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.7s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.7s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.8s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.8s




[CV] END intercept_scaling=1.0, loss=squared_epsilon_insensitive; total time=   0.9s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=1.0, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   2.4s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   2.4s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   2.4s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   2.4s




[CV] END intercept_scaling=10.0, loss=squared_epsilon_insensitive; total time=   2.4s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=10.0, loss=squared_epsilon_sensitive; total time=   0.0s




[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   2.5s




[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   2.5s




[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   2.5s




[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   2.5s


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/var/tmp/xmacko1/bachelor_thesis/.pyenv/versions/3.8.9/envs/ticketing-system/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/var/tmp/xmacko1/bachelor_thesis/.pyenv/versions/3.8.9/envs/ticketing-system/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/var/tmp/xmacko1/bachelor_thesis/.pyenv/versions/3.8.9/envs/ticketing-system/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    val

[CV] END intercept_scaling=100, loss=squared_epsilon_insensitive; total time=   2.5s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
[CV] END intercept_scaling=100, loss=squared_epsilon_sensitive; total time=   0.0s
Best Hyperparameters: {'intercept_scaling': 1.0, 'loss': 'squared_epsilon_insensitive'}
Root Mean Squared Error (RMSE): 2323.8339795129177


## Random Forest Regression

In [10]:
model, model_params = finetune(
    X_train_word2vec, 
    y_train,
    RandomForestRegressor(random_state=42, n_jobs=-1),
    {
        "min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
        "min_samples_leaf": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
        "max_samples": [0.6, 0.7, 0.8, 0.9, 1.0]
    },
    "neg_root_mean_squared_error",
    cv=5,
    verbose=2
    )
print("Best Hyperparameters:", model_params)
results["random forest word2vec"] = evaluate(X_test_word2vec, y_test, model)
print("Root Mean Squared Error (RMSE):", results["random forest word2vec"])

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   3.2s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   2.8s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   2.6s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   2.6s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   2.6s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   2.8s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   2.8s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   2.6s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   3.0s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   2.6s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=6; total tim

In [11]:
model, model_params = finetune(
    X_train_glove, 
    y_train,
    RandomForestRegressor(random_state=42, n_jobs=-1),
    {
        "min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
        "min_samples_leaf": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
        "max_samples": [0.6, 0.7, 0.8, 0.9, 1.0]
    },
    "neg_root_mean_squared_error",
    cv=5,
    verbose=2
    )
print("Best Hyperparameters:", model_params)
results["random forest glove"] = evaluate(X_test_glove, y_test, model)
print("Root Mean Squared Error (RMSE):", results["random forest glove"])

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   1.6s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   2.1s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   1.9s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   1.8s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=2; total time=   1.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   1.8s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   1.7s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   1.8s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   2.0s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=4; total time=   1.8s
[CV] END max_samples=0.6, min_samples_leaf=2, min_samples_split=6; total tim

## Evaluation

In [12]:
pd.DataFrame.from_dict(results, orient="index", columns=["rmse"])

Unnamed: 0,rmse
dummy mean,2375.462056
dummy median,2474.280056
linear regression word2vec,2257.101698
linear regression glove,2252.991318
linear svm word2vec,2579.291465
linear svm glove,2323.83398
random forest word2vec,2322.086202
random forest glove,2311.178355
