## Importy

In [4]:
import json
import numpy as np

from sklearn.model_selection import train_test_split

from Trail import Trail
from peak_computations import finished_cipher, get_team_time
from ImageModel import ImageModel
from HintModel import HintModel

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from keras.models import Model

import os
from PIL import Image
# Hide every GPU from TensorFlow so that the whole notebook runs on the CPU.
tf.config.set_visible_devices([], 'GPU')

# Root directory of the input images; create_datasets() reads one
# sub-directory per entry of LABELS from it.
DATA_DIR='resized_images'
# Not referenced anywhere in the visible part of this notebook.
TRAIN_TEST_DIR = 'output'
# Image class names; the position in this list is used as the numeric label.
LABELS=['easy', 'hard']
# Passed to ImageModel together with LABELS
# (presumably the image side length in pixels -- TODO confirm).
IMG_SIZE = 512

# Path prefix (note the trailing slash) of the cached .npy dataset files.
SAVED_DATA='saved/'

## Predikce nápověd

### Načtení dat o hrách
Toto jsou soubory, které pomáhají mapovat jméno obrázku na konkrétní hru a podobně.

In [5]:
# Maps the trail shortcut (the part of an image file name before the first
# underscore) to the trail name. The encoding is given explicitly because JSON
# is UTF-8 while the default of open() depends on the platform.
with open("trail_dict_shortcuts.json", encoding="utf-8") as f:
    trail_dict=json.load(f)
# Cache of already constructed Trail objects keyed by trail shortcut;
# filled lazily by get_trail_from_img_name().
my_trails=dict()

In [6]:
# Second trail mapping: the position of a trail name among its values is used
# as the numeric trail index. The encoding is given explicitly because JSON is
# UTF-8 while the default of open() depends on the platform.
with open("trail_dict.json", encoding="utf-8") as f:
    trail_dict_nums=json.load(f)

### Funkce pro přípravu dat

In [7]:
def get_layer_output_from_img(image, intermediate_layer_model):
    """Run a single image through ``intermediate_layer_model``.

    The previous version additionally called ``im_model.model.predict`` on a
    name that only exists as a local variable of ``create_datasets`` (so the
    call raised ``NameError``) and threw the result away; that call is gone.

    Args:
        image: anything ``np.array`` can convert to an array (e.g. a PIL image).
        intermediate_layer_model: object with a ``predict(batch)`` method.

    Returns:
        The first (and only) row of the prediction for a batch of one image.
    """
    # predict() works on batches, so wrap the image into a batch of size one.
    batch = np.asarray([np.array(image)])
    return intermediate_layer_model.predict(batch)[0]

In [8]:
# Derives the trail from an image file name so that the remaining data
# belonging to that trail can be looked up.
def get_trail_from_img_name(img_name):
    """Resolve an image file name to ``(trail_id, task_no, trail_index)``.

    The text before the first underscore is the trail shortcut; the text
    between the first underscore and the following dot (or next underscore)
    is the task number. Returns ``(False, False, False)`` when the shortcut
    is not a key of ``trail_dict``.

    Side effect: the first time a shortcut is seen, its ``Trail`` is built,
    its name is printed and the object is stored in ``my_trails``.
    """
    name_parts = img_name.split("_")
    shortcut = name_parts[0]
    if shortcut not in trail_dict:
        return False, False, False

    # The numeric trail index is the position of the trail name in trail_dict_nums.
    trail_name = trail_dict[shortcut]
    trail_index = list(trail_dict_nums.values()).index(trail_name)
    task_number = int(name_parts[1].split(".")[0])

    if shortcut not in my_trails:
        new_trail = Trail(trail_name)
        print(new_trail.name)
        my_trails[shortcut] = new_trail

    return shortcut, task_number, trail_index

In [9]:
# How many hints the team has taken so far / how many it could have taken.
def get_hint_sum(df, cipher_no, slug_to_assign, finished, hints_per_task=2):
    """Fraction of available hints each team used on the earlier ciphers.

    Args:
        df: event table with at least the columns ``user_id``, ``type`` and
            ``task_id``.
        cipher_no: only tasks whose key in ``slug_to_assign`` is smaller than
            this value are counted.
        slug_to_assign: mapping from task key to the ``task_id`` value in ``df``.
        finished: iterable of team ids (``user_id`` values).
        hints_per_task: number of hints a team could take per task; the
            default of 2 is the value that used to be hard-coded.

    Returns:
        One entry per team in ``finished``: taken hints divided by available
        hints, or the raw (zero) hint count when no earlier task exists.
    """
    # Neither the hint rows nor the set of earlier tasks depend on the team,
    # so both are computed once instead of once per (team, task) pair.
    hint_rows = df[df['type'] == 'TASK_HINT']
    earlier_slugs = [slug_to_assign[t] for t in slug_to_assign if t < cipher_no]
    available = hints_per_task * len(earlier_slugs)

    my_arr = []
    for team in finished:
        team_hints = hint_rows[hint_rows['user_id'] == team]
        taken = sum(int((team_hints['task_id'] == slug).sum()) for slug in earlier_slugs)
        my_arr.append(taken / available if available > 0 else taken)
    return my_arr

In [10]:
# Returns a 1/0 array with one entry per team that solved the cipher:
# the team took / did not take a hint. This is the predicted quantity.
def get_hint_taking(df, cipher_no, slug_to_assign, finished):
    """Tell for each team whether it took a hint on cipher ``cipher_no``.

    The previous version ignored ``cipher_no`` and ``slug_to_assign`` and
    returned 1 for a team that took a hint on *any* task, so the label was
    identical for every cipher of a trail. Only hint rows of the requested
    task are considered now.

    Args:
        df: event table with at least the columns ``user_id``, ``type`` and
            ``task_id``.
        cipher_no: key into ``slug_to_assign`` identifying the cipher.
        slug_to_assign: mapping from task key to the ``task_id`` value in ``df``.
        finished: sequence of team ids (``user_id`` values).

    Returns:
        List of 0/1 ints, one per team in ``finished``.

    Raises:
        KeyError: if ``cipher_no`` is not in ``slug_to_assign`` and
            ``finished`` is non-empty.
    """
    # Nothing to label; also avoids a lookup for a cipher nobody finished.
    if len(finished) == 0:
        return []

    task_hints = df[(df['type'] == 'TASK_HINT') & (df['task_id'] == slug_to_assign[cipher_no])]
    return [int((task_hints['user_id'] == team).any()) for team in finished]

In [11]:
# Average time spent on the previous ciphers.
def get_time_sum(df, cipher_no, slug_to_assign, finished):
    """Average ``get_team_time`` over the tasks that precede ``cipher_no``.

    ``get_team_time`` used to be called for every task, including those whose
    result was then discarded; it is now called only for tasks whose key is
    smaller than ``cipher_no``.

    Args:
        df: event table handed through to ``get_team_time``.
        cipher_no: only tasks whose key in ``slug_to_assign`` is smaller than
            this value are counted.
        slug_to_assign: mapping from task key to the task id passed to
            ``get_team_time``.
        finished: iterable of team ids.

    Returns:
        One entry per team in ``finished``: the mean time over the earlier
        tasks, or 0 when there is no earlier task.
    """
    # The set of earlier tasks is the same for every team.
    earlier_slugs = [slug_to_assign[t] for t in slug_to_assign if t < cipher_no]

    my_arr = []
    for team in finished:
        if earlier_slugs:
            total = sum(get_team_time(team, slug, df) for slug in earlier_slugs)
            my_arr.append(total / len(earlier_slugs))
        else:
            my_arr.append(0)
    return my_arr

### Příprava dat

Pro každou šifru, pro kterou jsou k dispozici obrazová i týmová data, vezmeme všechny týmy, které ji dokončily, a vytvoříme z nich vstupní data sítě.
Data jsou převedena na pole `numpy.ndarray` a rozdělena na trénovací a testovací sadu.

Datasety lze buď znovu vytvořit, nebo načíst z uložených.

In [12]:
def create_datasets():
    """Build the hint-prediction dataset from the images and the team logs.

    For every image under ``DATA_DIR/<label>/`` that maps to a known trail,
    one sample is produced per team that finished the corresponding cipher.

    Changes against the previous version:
      * ciphers that no team finished are skipped before the image is opened
        (``d_img[-1]`` used to raise ``IndexError`` or pick up the previous
        image in that case),
      * the image file is closed after use and no longer stored once per team
        in a list that was never returned,
      * directory listings are sorted so that the sample order is reproducible.

    Returns:
        Tuple of numpy arrays ``(n_st, n_trl, n_lbl, n_task, n_lay_o, n_h)``:
        per-sample (hint ratio, average time) pairs, trail index, image label
        index, task number, image-model layer output and the 0/1 hint target.
    """
    d_st=[]
    d_h=[]
    d_lbl=[]
    d_task=[]
    d_trl=[]
    d_lay_o=[]

    # Load the image classifier and expose the output of its second-to-last
    # layer, which is used as the image feature vector.
    im_model=ImageModel(LABELS, IMG_SIZE)
    im_model.load_weights()

    layer_names=[layer.name for layer in im_model.model.layers]
    intermediate_layer_model = Model(inputs=im_model.model.input,
                                     outputs=im_model.model.get_layer(layer_names[-2]).output)

    for label, label_name in enumerate(LABELS):
        label_dir = os.path.join(DATA_DIR, label_name)
        # os.listdir() returns entries in arbitrary order; sort for reproducibility.
        for image_name in sorted(os.listdir(label_dir)):
            trail_id, task_no, trail_idx = get_trail_from_img_name(image_name)
            if not trail_id:
                continue
            t = my_trails[trail_id]
            df = t.df
            finished = finished_cipher(task_no, df, t.slug_to_assign)

            stats=get_hint_sum(df, task_no, t.slug_to_assign, finished)
            hints=get_hint_taking(df, task_no, t.slug_to_assign, finished)
            times=get_time_sum(df, task_no, t.slug_to_assign, finished)

            n_teams = len(stats)
            if n_teams == 0:
                # No team finished this cipher: nothing to add, skip the forward pass.
                continue

            # The image features are identical for all teams, so compute them once.
            with Image.open(os.path.join(label_dir, image_name)) as img_file:
                layer_output = get_layer_output_from_img(img_file.convert("RGB"),
                                                         intermediate_layer_model)

            d_st.extend(zip(stats, times))
            d_h.extend(hints)
            d_lbl.extend([label]*n_teams)
            d_task.extend([task_no]*n_teams)
            d_trl.extend([trail_idx]*n_teams)
            d_lay_o.extend([layer_output]*n_teams)

    n_st=np.array(d_st)
    n_trl=np.array(d_trl)
    n_lbl=np.array(d_lbl)
    n_task=np.array(d_task)
    n_lay_o=np.array(d_lay_o)
    n_h=np.array(d_h)

    return n_st, n_trl, n_lbl, n_task, n_lay_o, n_h

In [13]:
def load_datasets():
    """Read the six cached dataset arrays from the ``SAVED_DATA`` location.

    Returns:
        Tuple ``(n_st, n_trl, n_lbl, n_task, n_lay_o, n_h)`` loaded from
        ``stats.npy``, ``trail.npy``, ``labels.npy``, ``task.npy``,
        ``layer.npy`` and ``hints.npy`` respectively.
    """
    file_names = ("stats.npy", "trail.npy", "labels.npy",
                  "task.npy", "layer.npy", "hints.npy")
    loaded = [np.load(SAVED_DATA+file_name) for file_name in file_names]
    n_st, n_trl, n_lbl, n_task, n_lay_o, n_h = loaded
    return n_st, n_trl, n_lbl, n_task, n_lay_o, n_h

In [14]:
# Use the cached arrays; call create_datasets() here instead to rebuild them
# from the images and the team logs.
n_st, n_trl, n_lbl, n_task, n_lay_o, n_h=load_datasets()

### Rozdělení dat a převod na tensory

In [15]:
# 90 % / 10 % train/validation split applied consistently to all six arrays.
tr_st, val_st, \
tr_trl, val_trl, \
tr_lbl, val_lbl, \
tr_task, val_task, \
tr_lay, val_lay, \
tr_h, val_h = train_test_split(n_st, n_trl, n_lbl, n_task, n_lay_o, n_h, train_size=0.9, random_state=42)


tr_st=tf.convert_to_tensor(tr_st)
val_st=tf.convert_to_tensor(val_st)

# One-hot depths are taken from the FULL arrays. Computing them per split (as
# before) gives the training and validation tensors different widths whenever
# the largest index happens to be missing from one of the splits.
trail_depth = int(n_trl.max()) + 1
tr_trl=tf.one_hot(tr_trl,trail_depth)
val_trl=tf.one_hot(val_trl,trail_depth)

tr_lbl=tf.one_hot(tr_lbl,len(LABELS))
val_lbl=tf.one_hot(val_lbl,len(LABELS))

# NOTE(review): unlike the trail depth there is no "+ 1" here, so the highest
# task number is encoded as an all-zero vector (tf.one_hot yields zeros for
# indices >= depth). The width is kept as it was because the saved checkpoint
# was presumably trained with it; use max + 1 when retraining from scratch.
task_depth = int(n_task.max())
tr_task=tf.one_hot(tr_task,task_depth)
val_task=tf.one_hot(val_task,task_depth)

tr_lay=tf.convert_to_tensor(tr_lay)
val_lay=tf.convert_to_tensor(val_lay)

In [16]:
# Each input is a list of five tensors (stats, image label, task, trail,
# image-layer output); the targets are the 0/1 hint labels.
x_train, y_train = [tr_st, tr_lbl, tr_task, tr_trl, tr_lay], tr_h
x_val, y_val = [val_st, val_lbl, val_task, val_trl, val_lay], val_h

### Model

In [17]:
def _report_class_balance(title, labels):
    """Print the size and positive share of ``labels`` and return ``(neg, pos)``.

    ``minlength=2`` keeps the two-way unpacking valid even when the positive
    class is absent from ``labels``.
    """
    neg, pos = np.bincount(labels, minlength=2)
    total = neg + pos
    print('{} examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        title, total, pos, 100 * pos / total))
    return neg, pos


neg, pos = _report_class_balance('Training', tr_h)
# Log-odds of the positive class in the training data; handed to HintModel below.
initial_bias = np.log([pos/neg])

neg, pos = _report_class_balance('Validation', val_h)
# Kept as a module-level name because the original cell defined it as well.
total = neg + pos

Training examples:
    Total: 39613
    Positive: 27216 (68.70% of total)

Validation examples:
    Total: 4402
    Positive: 3048 (69.24% of total)



## Evaluace modelu

In [18]:
# Build the hint model (initial_bias is handed to the constructor -- presumably
# used as the initial output bias, TODO confirm), restore the trained weights
# from the checkpoint and evaluate on the validation split.
my_model=HintModel(initial_bias)
my_model.set_checkpoint_path("hint_prediction_weights/cp.ckpt")
my_model.load_weights()
# The last expression is the cell output: [loss, accuracy].
my_model.model.evaluate(x_val, y_val, verbose=2)

138/138 - 0s - loss: 0.5878 - accuracy: 0.7117 - 191ms/epoch - 1ms/step


2022-06-21 18:19:14.895948: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


[0.5878480672836304, 0.7117219567298889]

In [19]:
# Same evaluation on the training split, for comparison with the validation result.
my_model.model.evaluate(x_train, y_train, verbose=2)

1238/1238 - 0s - loss: 0.5975 - accuracy: 0.7042 - 439ms/epoch - 355us/step


[0.5974576473236084, 0.7041627764701843]

## Uložení datasetů

In [41]:
# exist_ok=True replaces the separate existence check and does not fail when
# the directory is created between the check and the call.
os.makedirs(SAVED_DATA, exist_ok=True)

# File names must stay in sync with the ones read by load_datasets().
for file_name, array in (
    ("stats.npy", n_st),
    ("trail.npy", n_trl),
    ("labels.npy", n_lbl),
    ("task.npy", n_task),
    ("layer.npy", n_lay_o),
    ("hints.npy", n_h),
):
    np.save(SAVED_DATA+file_name, array)