In [None]:
import pandas as pd
import re

## Helper functions for data extraction, pandas joins, etc.

In [None]:
def get_task_dir_name(target_file):
    """Return the part of ``target_file`` that precedes its first ``"."``.

    A string without any dot is returned unchanged.
    """
    stem, _dot, _rest = target_file.partition(".")
    return stem

def convert_to_python_filename(task_name):
    """Build a snake_case ``.py`` file name from a free-form task name.

    The name is lower-cased and every run of characters outside
    ``[a-zA-Z0-9]`` becomes a single underscore, including runs at the very
    start or end of the name.
    """
    lowered = task_name.lower()
    return re.sub(r'[^a-zA-Z0-9]+', "_", lowered) + ".py"

def get_llm_model_name(filename):
    """Return the first known LLM model identifier contained in ``filename``.

    Candidates are tried in a fixed order; ``None`` is returned when none of
    them occurs in ``filename``.
    """
    known_models = ("deepseek_coder", "gpt_4o_2024_08_06", "gemini_1_5_pro")
    return next((name for name in known_models if name in filename), None)

## Load all data sources

In [None]:
# Locations of the three input tables: two reference CSVs one directory up and
# the per-suite mutation results in the current directory.
adequacy_stats = "../reference_data/combined_stats.csv"
attribute_stats = "../reference_data/filtered_Python.csv"
mutation_stats = "./per_suite.csv"

# Read every table into its own DataFrame.
df_suite, python_data, classic_adequacy_df_results = (
    pd.read_csv(csv_path)
    for csv_path in (mutation_stats, attribute_stats, adequacy_stats)
)

In [None]:
# Summary statistics of the per-suite mutation results.
df_suite.describe()

In [None]:
# Rows where `all_mutants` equals `incompetent`, i.e. the difference is zero.
df_suite[ df_suite.all_mutants - df_suite.incompetent == 0 ]

In [None]:
# Rows with at least one killed or at least one surviving mutant.
# Each comparison needs its own parentheses: `|` binds more tightly than `!=`,
# so without them the expression is parsed as `((killed != 0) | survived) != 0`.
df_suite[ (df_suite.killed != 0) | (df_suite.survived != 0) ]

In [None]:
# Load the per-test results from the current directory and display them.
per_test = pd.read_csv("per_test.csv")
per_test

## Preprocess data

In [None]:
def _normalise_task_name(raw_name):
    """Turn a raw task name into the key format produced by `get_task_dir_name`."""
    return get_task_dir_name(convert_to_python_filename(raw_name))


# Put the `task_name` column of both reference tables into the same key format.
python_data["task_name"] = python_data["task_name"].apply(_normalise_task_name)
classic_adequacy_df_results["task_name"] = (
    classic_adequacy_df_results["task_name"].apply(_normalise_task_name)
)

# The per-suite table derives its task key and its model name from file names.
df_suite["task_name"] = df_suite["target_file"].apply(get_task_dir_name)
df_suite["llm_model"] = df_suite["test_module_name"].apply(get_llm_model_name)

# Keep only the rows of the classic adequacy table whose language is Python.
is_python = classic_adequacy_df_results["language_name"] == "Python"
python_classic_adequacy_df_results = classic_adequacy_df_results[is_python]

## Merge into common metrics dataframe

In order to answer the research questions, we need to join the data computed from our mutpy runs
with the earlier statistics on classic test-adequacy metrics and code attributes,
so that we get the following data entries:
    - for every test file (row): MT metrics, classic metrics, code attributes

In [None]:
# Start from a copy of the per-suite results and add two placeholder columns
# that hold None in every row.
benchmark_df = df_suite.copy().assign(
    target_code_length=None,
    target_line_count=None,
)

# Names of the classic adequacy score columns.
valuable_columns = [
    "line_coverage_score",
    "branch_coverage_score",
    "assertions_mccabe_ratio_score",
    "assertions_density_score",
    "warnings_count_score",
]

# Left-join the classic adequacy metrics on (llm_model, task_name) and then the
# code attributes on task_name, so every per-suite row is kept.
benchmark_df = benchmark_df.merge(
    python_classic_adequacy_df_results,
    how='left',
    on=["llm_model", "task_name"],
)
overall_df = benchmark_df.merge(python_data, how='left', on=["task_name"])
overall_df["llm_model"] = overall_df["llm_model"].astype('category')

overall_df

In [None]:
# Persist the merged metrics table. `to_csv` does not create missing parent
# directories, so make sure the output folder exists before writing.
import os

os.makedirs("output", exist_ok=True)
overall_df.to_csv("output/combined_test_metrics.csv")

In [None]:

# Columns that take part in the correlation analysis.
selected_columns = [
    "tests_passed",
    "tests_failed",
    "mutation_score",
    "rapfd_score",
    "random_rapfd_score",
    "branch_coverage_score",
    "assertions_density_score",
    "warnings_count_score",
    "assertions_mccabe_ratio_score",
    "line_coverage_score",
    "code_length",
    "line_count",
]

# Restrict the merged table to those columns and print their pairwise
# Spearman rank correlations.
df = overall_df[selected_columns]
spearman_corr = df.corr(method="spearman")
print(spearman_corr)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Pairwise Spearman rank correlations of the selected columns.
spearman_corr = df.corr(method="spearman")

# Render the matrix as an annotated heatmap on a fresh 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(spearman_corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Spearman Correlation Heatmap")
plt.show()

In [None]:
# Keep only rows without missing values so every test runs on the same sample.
df_clean = df.dropna()

# Spearman correlation (rho and p-value) of the mutation score against each
# selected column, the mutation score itself included.
mutation_scores = df_clean['mutation_score']
for metric in selected_columns:
    other_scores = df_clean[metric]
    rho, pval = spearmanr(mutation_scores, other_scores)
    print(f"MutationScore-{metric} Spearman's rho: {rho:.3f}, p-value: {pval:.4f} ({pval})")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import numpy as np

# Step 1: pairwise Spearman correlation and p-value matrices over the rows of
# `df_clean` (the selected columns with missing values dropped).
cols = df_clean.columns
n = len(cols)
corr_matrix = pd.DataFrame(np.zeros((n, n)), columns=cols, index=cols)
pval_matrix = pd.DataFrame(np.zeros((n, n)), columns=cols, index=cols)

for i in range(n):
    for j in range(n):
        col1, col2 = cols[i], cols[j]
        rho, pval = spearmanr(df_clean[col1], df_clean[col2])
        corr_matrix.iloc[i, j] = rho
        pval_matrix.iloc[i, j] = pval

# Step 2: two-decimal cell labels and one text colour per cell
# (grey where p > 0.05, black otherwise).
annot = corr_matrix.apply(lambda column: column.map(lambda value: f"{value:.2f}"))
text_colors = np.where(pval_matrix.to_numpy() > 0.05, "grey", "black")

# Step 3: plot the heatmap with the pre-formatted labels.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(
    corr_matrix,
    annot=annot,
    fmt="",
    cmap="coolwarm_r",
    cbar_kws={'label': "Spearman ρ"},
    annot_kws={"size": 10},
)

# Colour every label from the cell it sits in rather than from its position in
# `ax.texts`. The heatmap places each label at the centre of its cell
# (column + 0.5, row + 0.5) and draws no label for a NaN correlation (e.g. a
# constant column), so pairing labels with a flattened colour array would shift
# all following colours as soon as one cell is missing.
for text in ax.texts:
    x_center, y_center = text.get_position()
    text.set_color(text_colors[int(y_center), int(x_center)])

plt.title("Spearman Correlation (Gray = Not Significant, p > 0.05)")
plt.tight_layout()
plt.savefig("image/correlation_heatmap_with_pval", bbox_inches='tight')
plt.show()


In [None]:
# Spearman correlations computed on the rows without missing values.
spearman_corr_clean = df_clean.corr(method="spearman")

# Draw the annotated heatmap on a fresh 10x8 figure, save it, then show it.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(spearman_corr_clean, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Spearman Correlation Heatmap")
fig.savefig("image/correlation_heatmap", bbox_inches='tight')
plt.show()

In [None]:
# Pairwise scatter-plot grid of the selected columns, using only the rows
# without missing values.
sns.pairplot(df_clean)

# Render the figure.
plt.show()

In [None]:
# Histogram of the mutation score with a KDE overlay. `histplot` draws counts
# by default, so the y-axis is labelled as a count rather than a density.
sns.histplot(df['mutation_score'], bins=11, kde=True)
plt.title('Mutation Score Distribution with KDE')
plt.xlabel('mutation score')
plt.ylabel('count')
plt.show()

In [None]:
# Overlaid histograms (with KDE) of the real and the randomised RAPFD scores.
sns.histplot(df['rapfd_score'], bins=11, label="Real RAPFD", kde=True)
sns.histplot(df['random_rapfd_score'], label="Random RAPFD", bins=11, kde=True)


plt.title('RAPFD KDE Distribution w/ Bins comparisson ')
# The x-axis carries RAPFD values (not mutation scores), and `histplot` draws
# counts by default, so label both axes accordingly.
plt.xlabel('RAPFD score')
plt.ylabel('count')
plt.legend()

plt.show()

In [None]:
# Density curves of the real and the randomised RAPFD scores on one axes.
ax = sns.kdeplot(df['rapfd_score'], label='RAPFD', linewidth=2)
sns.kdeplot(df['random_rapfd_score'], label='Random RAPFD', linewidth=1, ax=ax)

ax.set_title('RAPFD Distributions compared')
ax.set_xlabel('Value')
ax.set_ylabel('Density')
ax.legend()
plt.show()

# Initial look at the data

Let's start by looking at the pairwise scatter plot of all data columns.
Although it is too large for detailed analysis, obvious issues can still be spotted.

In [None]:
# Pairwise scatter-plot grid of all selected columns (rows with missing values
# are not dropped here).
sns.pairplot(df)

# Render the figure.
plt.show()

In [None]:
# Pairwise plot restricted to the mutation score and the RAPFD score.
sns.pairplot(df, vars=['mutation_score', 'rapfd_score'])

plt.show()

In [None]:
# Density of the per-test score. The series label names the plotted column
# (it previously read 'Random RAPFD', copied from the RAPFD plots).
sns.kdeplot(per_test['per_test_score'], label='Per-test score', linewidth=1)

In [None]:
# Histogram (with KDE overlay) of the per-test score. The series label names
# the plotted column (it previously read "Random RAPFD", copied from the RAPFD plots).
sns.histplot(per_test['per_test_score'], label="Per-test score", bins=11, kde=True)


In [None]:
# Density of the per-test score (same plot as two cells above). The series
# label names the plotted column instead of the copied 'Random RAPFD'.
sns.kdeplot(per_test['per_test_score'], label='Per-test score', linewidth=1)

In [None]:
# Mean mutation score per LLM model.
overall_df.groupby("llm_model")["mutation_score"].mean()

In [None]:
# Mean RAPFD score per LLM model.
overall_df.groupby("llm_model")["rapfd_score"].mean()

In [None]:
# Mean randomised RAPFD score per LLM model.
overall_df.groupby("llm_model")["random_rapfd_score"].mean()

In [None]:
# sns.pairplot(overall_df, vars= hue="llm_model")

# plt.show()

In [None]:
# Pairwise plot of the selected columns, coloured by LLM model.
sns.pairplot(overall_df, vars=selected_columns, hue="llm_model")

plt.show()

In [None]:
# One mutation-score density curve per model, all drawn on the current axes.
for model, group in overall_df.groupby('llm_model'):
    sns.kdeplot(group['mutation_score'], label=model)

kde_ax = plt.gca()
kde_ax.set_xlabel('Mutation Score')
kde_ax.set_ylabel('Density')
kde_ax.set_title('KDE of Mutation Scores by Model')
kde_ax.legend()
plt.show()

In [None]:
# Density of the real (solid) and the randomised (dashed) RAPFD score for each
# model; both curves of a model share one colour. `zip` stops at the shorter
# input, so at most three models are drawn.
colors = ["blue", "orange", "green"]
for (model, group), color in zip(overall_df.groupby('llm_model'), colors):
    sns.kdeplot(group['rapfd_score'], label=model+"_rapfd", color=color)
    sns.kdeplot(group['random_rapfd_score'], label=model+"_rapfd_random", linestyle="--", color=color)


plt.xlabel('RAPFD Score')
plt.ylabel('Density')
plt.title('KDE of RAPFD Scores (original+randomized) by Model')
plt.legend()

# Saved under its own file name: a later cell writes the RAPFD box plot to
# "image/rapfd_comparisson_boxplot", which would overwrite this KDE figure.
plt.savefig("image/rapfd_comparisson_kde", bbox_inches='tight')

plt.show()

In [None]:
# Summed mutant outcomes per model as grouped bars, the model with the most
# kills first.
(
    overall_df.groupby('llm_model')
    .agg({"timeout":"sum","incompetent":"sum","survived":"sum","killed":"sum"})
    .sort_values(by='killed', ascending=False)
    .plot.bar()
)


In [None]:
# Total number of mutants per LLM model, as a bar chart.
overall_df.groupby('llm_model').agg({"all_mutants":"sum"}).plot.bar()


In [None]:
# Summed mutant outcomes per model as stacked bars. `all_mutants` is only
# aggregated to order the models (largest total first) and is dropped again
# before plotting.
(
    overall_df.groupby('llm_model')
    .agg({"timeout":"sum","incompetent":"sum","killed":"sum", "survived":"sum", "all_mutants":"sum"})
    .sort_values(by='all_mutants', ascending=False)
    .drop(columns='all_mutants')
    .plot(kind="bar", stacked=True)
)


In [None]:
import matplotlib.pyplot as plt

# Step 1: summed mutant outcomes per model, largest mutant total first.
outcome_columns = ["timeout", "incompetent", "killed", "survived", "all_mutants"]
counts_df = (
    overall_df.groupby('llm_model')
    .agg({column: "sum" for column in outcome_columns})
    .sort_values(by='all_mutants', ascending=False)
)

# Step 2: drop the total and turn each row into shares that sum to 1.
ratios_df = counts_df.drop(columns='all_mutants')
ratios_normalized = ratios_df.div(ratios_df.sum(axis=1), axis=0)

# Fixed colour per outcome, listed in the column order of the plotted frames.
custom_colors = {
    "killed": "green",
    "survived": "red",
    "timeout": "lightblue",
    "incompetent": "lightgrey",
}
color_list = [custom_colors[col] for col in ratios_normalized.columns]

fig, axes = plt.subplots(ncols=2, figsize=(10, 4))

# Left panel: absolute counts as stacked bars; its legend is removed because
# the right panel carries the shared legend.
ratios_df.plot(kind="bar", ax=axes[0], stacked=True, rot=30, color=color_list).legend().remove()

# Step 3: right panel with the normalised shares as horizontal stacked bars.
ax = ratios_normalized.plot(kind='barh', ax=axes[1], rot=60, stacked=True, figsize=(10, 6), color=color_list)
ax.legend(
    title="Outcome",
    loc='upper left',
    bbox_to_anchor=(1.05, 1),  # x > 1 places the legend outside the axes box
    borderaxespad=0.
)

# Write the percentage into the middle of every segment wider than 3 %.
for row_index, shares in enumerate(ratios_normalized.values):
    left_edge = 0
    for share in shares:
        if share > 0.03:
            ax.text(left_edge + share / 2, row_index, f"{share:.0%}", ha='center', va='center', fontsize=8)
        left_edge += share

# Labels and layout.
plt.xlabel("Proportion")
plt.ylabel("LLM Model")
plt.tight_layout()

plt.savefig("image/mutants_stats", bbox_inches='tight')

plt.show()


In [None]:
# Display the merged metrics table.
overall_df

In [None]:
# Passed and failed test counts summed per model.
stats_df = overall_df.groupby('llm_model').agg({"tests_passed":"sum","tests_failed":"sum"})

fig, axes = plt.subplots(ncols=2, figsize=(10, 4))

# Left panel: grouped bars with a value label on every bar.
grouped_ax = stats_df.plot(kind='bar', ax=axes[0], rot=30)
for container in grouped_ax.containers:
    grouped_ax.bar_label(container)

# Right panel: horizontal stacked bars; only the second container
# (`tests_failed`) is labelled.
ax = stats_df.plot(kind='barh', ax=axes[1], stacked=True, rot=80)
ax.bar_label(ax.containers[1])
plt.savefig("image/test_passed_failed_overal_per_llm", bbox_inches='tight')

In [None]:
# Violin plot of the mutation score, one violin per LLM model.
sns.violinplot(data=overall_df, x="mutation_score", y="llm_model")

In [None]:
# Horizontal box plot of the mutation score, one box per model.
ax = sns.boxplot(data=overall_df, x="mutation_score", y="llm_model")

# Median mutation score of every model.
medians = overall_df.groupby('llm_model')['mutation_score'].median()

# Print each median next to its box, nudged slightly to the right of the
# median position.
label_style = dict(va='center', ha='left', color='black', fontsize=9)
for row_index, median_val in enumerate(medians.to_numpy()):
    ax.text(median_val + 0.01, row_index, f"{median_val:.2f}", **label_style)

ax.set_title("Mutation Score per Model (with Median)")
plt.tight_layout()
plt.savefig("image/mutation_score_per_llm_boxplot", bbox_inches='tight')
plt.show()

In [None]:
# Box plot of the randomised RAPFD score for the deepseek_coder rows only.
sns.boxplot(data=overall_df[ overall_df.llm_model=="deepseek_coder" ], x="random_rapfd_score")

In [None]:
# Box plot of the RAPFD score for the deepseek_coder rows only.
sns.boxplot(data=overall_df[ overall_df.llm_model=="deepseek_coder" ], x="rapfd_score")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Long format: one row per (model, metric) value so the boxes can be split by
# metric.
rapfd_melted = overall_df.melt(
    id_vars='llm_model',
    value_vars=['rapfd_score', 'random_rapfd_score'],
    var_name='Metric',
    value_name='Score',
)

# Explicit ordering of the x-axis groups and of the two boxes inside a group.
hue_order = ['rapfd_score', 'random_rapfd_score']
model_order = rapfd_melted['llm_model'].unique()

ax = sns.boxplot(
    x='llm_model',
    y='Score',
    hue='Metric',
    data=rapfd_melted,
    order=model_order,
    hue_order=hue_order,
)

# Median score of every (model, metric) pair.
group_medians = rapfd_melted.groupby(['llm_model', 'Metric'])['Score'].median()

# Write each median just above its median line. The label is shifted 0.2 to the
# left or right of the group position to line up with the side-by-side boxes.
label_style = dict(ha='center', va='bottom', fontsize=8, color='black')
for i, model in enumerate(model_order):
    for j, metric in enumerate(hue_order):
        median_val = group_medians[model, metric]
        x_pos = i - 0.2 + 0.4 * j
        ax.text(x_pos, median_val, f"{median_val:.2f}", **label_style)

# Final layout, then save and show.
plt.title("RAPFD Score Comparison (Original vs Random) for Each Model")
plt.xticks(rotation=0)
plt.legend(title='Metric', loc='upper right', ncol=2)
plt.tight_layout()
plt.savefig("image/rapfd_comparisson_boxplot", bbox_inches='tight')
plt.show()


In [None]:
# One RAPFD box-plot call per model category, each on that model's rows only.
# No `ax` is passed, so the calls are not directed to separate axes.
for model in overall_df.llm_model.cat.categories:
    sns.boxplot(data=overall_df[ overall_df.llm_model==model ], x="rapfd_score", y="llm_model")

# Disabled alternative: a single mutation-score box plot saved to disk.
# sns.boxplot(data=overall_df, x="mutation_score", y="llm_model")
# plt.savefig("image/mutation_score_per_llm_boxplot", bbox_inches='tight')


In [None]:
import matplotlib.pyplot as plt

import numpy as np

# Rows of the deepseek_coder model, ordered by RAPFD score so the real curve
# rises monotonically.
is_deepseek = overall_df.llm_model == "deepseek_coder"
df_sorted = overall_df[is_deepseek].sort_values('rapfd_score')

line_ax = plt.gca()
line_ax.plot(df_sorted['rapfd_score'].values, label='RAPFD', marker='o')
line_ax.plot(df_sorted['random_rapfd_score'].values, label='Random RAPFD', marker='x')

# Signed gap between the two scores: positive where the randomised score lies
# above the real one, negative where it lies below.
diff = df_sorted["random_rapfd_score"] - df_sorted["rapfd_score"]

# Total size of the gaps above and below (both as positive numbers).
total_above = np.sum(diff[diff > 0])
total_below = np.abs(np.sum(diff[diff < 0]))

# Share of the total absolute gap on each side; both are 0 when there is no gap.
total_weight = np.sum(np.abs(diff))
if total_weight != 0:
    prop_above = total_above / total_weight
    prop_below = total_below / total_weight
else:
    prop_above = 0
    prop_below = 0
summary = (
    f"Proportion above: {prop_above:.2%}\n"
    f"Proportion below: {prop_below:.2%}"
)

# The two shares are shown as the legend title.
line_ax.set_xlabel("File index (sorted by RAPFD)")
line_ax.set_ylabel("Score")
line_ax.set_title("RAPFD vs Randomized RAPFD per File")
line_ax.legend(title=f"{summary}")
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np

# The layout below is hard-wired for exactly three models.
models = overall_df.llm_model.unique()
assert len(models) == 3, "This layout works for exactly 3 models."

# 2x2 outer grid: two panels on the top row, one centred panel underneath.
fig = plt.figure(figsize=(10, 6))
gs = gridspec.GridSpec(2, 2)

# Top row: two side-by-side panels.
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])

# Bottom row: split it into four equal columns and use the middle two, so the
# third panel is as wide as a top panel and horizontally centred. (Columns 1:
# of a three-column split would span the right two thirds instead.)
gs_bottom = gs[1, :].subgridspec(1, 4)
ax3 = fig.add_subplot(gs_bottom[0, 1:3])

axes = [ax1, ax2, ax3]

# One panel per model.
for ax, model in zip(axes, models):
    # Order the model's rows by RAPFD score so the real curve rises monotonically.
    df_sorted = overall_df[overall_df.llm_model == model].sort_values('rapfd_score')

    rapfd = df_sorted['rapfd_score'].values
    random_rapfd = df_sorted['random_rapfd_score'].values

    ax.plot(rapfd, label='RAPFD', marker='o', alpha=0.6)
    ax.plot(random_rapfd, label='Random RAPFD', marker='x', alpha=0.6)

    # Signed gap: positive where the randomised score lies above the real one.
    diff = df_sorted["random_rapfd_score"] - df_sorted["rapfd_score"]

    # Total size of the gaps above and below (both as positive numbers).
    total_above = np.sum(diff[diff > 0])
    total_below = np.abs(np.sum(diff[diff < 0]))

    # Share of the total absolute gap on each side; 0 when there is no gap.
    total_weight = np.sum(np.abs(diff))
    prop_above = total_above / total_weight if total_weight != 0 else 0
    prop_below = total_below / total_weight if total_weight != 0 else 0
    summary = (
        f"Proportion above: {prop_above:.2%}\n"
        f"Proportion below: {prop_below:.2%}"
    )

    ax.set_title(model)
    ax.set_xlabel("File index (sorted by RAPFD)")
    ax.set_ylabel("Score")
    # The two shares are shown as the legend title of the panel.
    ax.legend(title=summary)

# Overall title; the layout rectangle leaves the top 5 % free for it.
plt.suptitle("RAPFD vs Randomized RAPFD per File for Each Model", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.savefig("image/rapfd_vs_random_triangle", bbox_inches='tight')
plt.show()


In [None]:
# Reload the per-test results and convert the `is_failed` column to integers.
per_test = pd.read_csv("per_test.csv").assign(
    is_failed=lambda frame: frame["is_failed"].astype(int)
)
per_test

In [None]:
from scipy.stats import mannwhitneyu

# Split the per-test scores by the binary `is_failed` column, dropping missing
# scores: group0 holds is_failed == 0, group1 holds is_failed == 1.
not_failed = per_test['is_failed'] == 0
failed = per_test['is_failed'] == 1
group0 = per_test.loc[not_failed, 'per_test_score'].dropna()
group1 = per_test.loc[failed, 'per_test_score'].dropna()

# Two-sided Mann–Whitney U test between the two groups.
stat, p_value = mannwhitneyu(group0, group1, alternative='two-sided')

print(f"Mann–Whitney U statistic: {stat:.3f}")
print(f"p-value: {p_value:.5f}")

In [None]:
# Only the tests flagged as failed.
failed_unit_tests = per_test[per_test['is_failed'] == 1]

# Ten-bin histogram of their per-test score on a fresh 8x5 figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(failed_unit_tests["per_test_score"], bins=10, ax=ax)

# Write the bar height on top of every non-empty bin.
for patch in ax.patches:
    height = patch.get_height()
    if height <= 0:
        continue
    bar_center = patch.get_x() + patch.get_width() / 2
    ax.text(bar_center, height, f'{height:.0f}', ha='center', va='bottom', fontsize=9)

# The y-label follows whatever statistic the histogram reports on its axis.
y_label = 'density' if ax.get_ylabel() == 'Density' else 'count'
ax.set_title('Mutation score for failed unit tests')
ax.set_xlabel('mutation score')
ax.set_ylabel(y_label)
fig.savefig("image/failed_unit_tests_mutation_score_binned", bbox_inches='tight')
plt.show()

## Experimental continuation - PCA, feature selection, ...
### PCA and Feature Selection

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Remove columns that are missing in every row, then every row that still has
# a missing value.
all_nan_columns = overall_df.columns[overall_df.isna().all()]
feature_df = overall_df.drop(columns=all_nan_columns).dropna()

# Classic adequacy metrics, and the same list extended by the mutation score.
classic_metrics = [
    "branch_coverage_score",
    "assertions_mccabe_ratio_score",
]
new_metrics = [*classic_metrics, "mutation_score"]

# Standardise the metric columns before the PCA.
X = StandardScaler().fit_transform(feature_df[new_metrics])

# Index-to-column mapping of the metrics.
print(list(enumerate(new_metrics)))

# Fit a PCA with as many components as metrics and print the share of variance
# explained by each component.
pca = PCA(n_components=len(new_metrics))
pca.fit(X)

variance_shares = [(idx, float(var)) for idx, var in enumerate(pca.explained_variance_ratio_)]
print(variance_shares)

In [None]:
from sklearn.feature_selection import mutual_info_regression

# Mutual information between each classic metric and the mutation score.
# The estimator is randomised, so a fixed `random_state` is passed to make the
# printed values reproducible across runs.
X = feature_df[classic_metrics]
y = feature_df['mutation_score']
mi = mutual_info_regression(X, y, random_state=0)
print(pd.Series(mi, index=X.columns))