In [1]:
import json
import splitfolders

from PIL import Image
from pdf2image import convert_from_path
import shutil
import pathlib

import os
from os import listdir

from Trail import Trail

# Root directory that holds all the data.
KNIHOVNIK = './'

# Resized images are written here.
DATA_DIR = 'resized_images'

# The train/test split writes the training and test sets here.
TRAIN_TEST_DIR = 'output'

# Ciphers pulled from the databank, manually sorted into easy/hard.
# This data is not shipped with the project.
COURSES_DIR = f'{KNIHOVNIK}sifry_z_databanky'

# Ciphers pulled from the DNEM yearly editions, manually sorted into easy/hard.
DNEM_DIR = f'{KNIHOVNIK}DNEM'

# This data is not shipped with the project.
TRAILS_DIR = f'{KNIHOVNIK}trails'

# All ciphers in their original size, for better text extraction.
ORIGINALS_DIR = f'{KNIHOVNIK}images'

# Class labels; they double as sub-directory names for the labelled images.
LABELS = ['easy', 'hard']

# Edge length (in pixels) of the square images produced by resizing.
IMG_SIZE = 512

In [2]:
# Mapping from the trail shortcut used as file-name prefix to the trail name
# that is handed to Trail(...). The encoding is pinned to UTF-8 so that
# non-ASCII trail names decode the same way on every platform instead of
# depending on the locale's default encoding.
with open("trail_dict_shortcuts.json", encoding="utf-8") as f:
    trail_dict = json.load(f)

# Cache of Trail objects keyed by trail shortcut; get_label fills it lazily.
my_trails = dict()

## Preprocessing obrázků

In [3]:
def get_label(img_name, threshold):
    """Return the difficulty label ('easy' or 'hard') for a trail image.

    The file name is expected to look like ``<trail_id>_<task_no>.<ext>``,
    where ``task_no`` is the 1-based number of the task within the trail.
    The Trail object for ``trail_id`` is taken from the module-level
    ``my_trails`` cache; on a cache miss it is built from
    ``trail_dict[trail_id]``, its name is printed and it is cached.

    Args:
        img_name: File name of the image (without any directory part).
        threshold: A task whose peak value is strictly greater than this is
            labelled 'hard' (presumably a time in seconds - confirm against
            Trail.get_peaks).

    Returns:
        'hard' or 'easy'.

    Raises:
        IndexError: If the name contains no underscore or the task number is
            larger than the number of peaks.
        ValueError: If the task part is not an integer or is smaller than 1.
        KeyError: If ``trail_id`` is not a key of ``trail_dict``.
    """
    # Split "<trail_id>_<task_no>.<ext>" into its trail id and task number.
    name_parts = img_name.split("_")
    trail_id = name_parts[0]
    task_no = int(name_parts[1].split(".")[0])
    if task_no < 1:
        # peaks[task_no - 1] would silently wrap around to the end of the list.
        raise ValueError(
            "task number must be >= 1, got %d in %r" % (task_no, img_name))

    if trail_id in my_trails:
        trail = my_trails[trail_id]
    else:
        trail = Trail(trail_dict[trail_id])
        # Printed once per trail, i.e. only when it is first loaded.
        print(trail.name)
        my_trails[trail_id] = trail

    peaks = trail.get_peaks()
    return 'hard' if peaks[task_no - 1] > threshold else 'easy'

In [4]:
def resize_image(img_name, size, source_dir, threshold, given_label='easy'):
    """Resize one image to ``size`` x ``size`` and store it under its label.

    The result is written to ``DATA_DIR/<label>/<img_name>``. The image is
    scaled to a square without preserving its aspect ratio.

    Args:
        img_name: File name of the image inside ``source_dir``.
        size: Edge length in pixels of the square output image.
        source_dir: Directory that contains ``img_name``.
        threshold: ``0`` means "use ``given_label``"; any other value is
            passed to ``get_label`` to derive the label from the file name.
        given_label: Label used when ``threshold`` is ``0``.
    """
    if threshold == 0:
        label = given_label
    else:
        label = get_label(img_name, threshold)

    # exist_ok=True creates DATA_DIR and the label directory in one call and
    # does not fail when they are already there.
    label_dir = os.path.join(DATA_DIR, label)
    os.makedirs(label_dir, exist_ok=True)

    source_path = os.path.join(source_dir, img_name)
    target_path = os.path.join(label_dir, img_name)
    with Image.open(source_path) as pil_image:
        im_resized = pil_image.resize((size, size))
        # NOTE: the data is always written as PNG, but the file keeps its
        # original name, including a possibly different extension.
        im_resized.save(target_path, format='PNG')

In [5]:
def convert_pdf_to_png(img_name, tmp_dir, source_dir, save_converted_image=True):
    """Render the first page of a PDF as a PNG file.

    The PNG is written to ``tmp_dir`` and, when ``save_converted_image`` is
    true, also to ``ORIGINALS_DIR``. Both directories are created on demand.

    Args:
        img_name: File name of the PDF inside ``source_dir``.
        tmp_dir: Directory that receives the converted PNG.
        source_dir: Directory that contains ``img_name``.
        save_converted_image: Whether to keep a copy in ``ORIGINALS_DIR``.

    Returns:
        The file name (without directory) of the PNG that was written.
    """
    pdf_path = os.path.join(source_dir, img_name)
    os.makedirs(tmp_dir, exist_ok=True)

    # Replace only the final extension, so that names containing further dots
    # are not truncated (and cannot collide with each other).
    new_image_name = os.path.splitext(img_name)[0] + ".png"
    tmp_path = os.path.join(tmp_dir, new_image_name)

    # Only the first page is used, so only that page is rendered.
    pages = convert_from_path(pdf_path, first_page=1, last_page=1)
    first_page = pages[0]
    first_page.save(tmp_path, 'PNG')

    if save_converted_image:
        os.makedirs(ORIGINALS_DIR, exist_ok=True)
        originals_path = os.path.join(ORIGINALS_DIR, new_image_name)
        # When tmp_dir is ORIGINALS_DIR itself the file already exists there.
        if os.path.abspath(originals_path) != os.path.abspath(tmp_path):
            first_page.save(originals_path, 'PNG')

    return new_image_name

In [6]:
'''
Works for trail images that have data from games and can be labeled automatically.

Goes through all images in the folder, converts PDFs to PNG if necessary,
shrinks every image to img_size x img_size and labels it.
'''

tmp_dir = 'tmp'
img_size = IMG_SIZE
source_dir = TRAILS_DIR

# time limit for a task to be easy => 18 minutes
threshold = 18*60
try:
    for image in os.listdir(source_dir):
        # Hidden files are skipped first, so that hidden PDFs (for example
        # "._name.pdf") are not sent to the converter.
        if image.startswith("."):
            continue
        if image.lower().endswith(".pdf"):
            new_image_name = convert_pdf_to_png(image, tmp_dir, source_dir)
            resize_image(new_image_name, img_size, tmp_dir, threshold)
        else:
            resize_image(image, img_size, source_dir, threshold)
finally:
    # Remove the temporary PNGs even when a conversion or resize failed.
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)

Ztracené židovské město
Ve stínu černé vrány
Obrazy Josefa Temperníka
Staré pověsti české
Před pikolou, za&nbsp;pikolou...
Moravský Manchester
Loupež po telefonu
Královské mysterium
Sedm klíčů
Šeptající javor
Fantom Brna
Příběh Enigmy
Dopis bez adresy
Osmý div světa
Avraham Harshalom


In [8]:
'''
Convert and resize manually labeled data.

Each sub-folder of my_source_dir named after a label is processed, and the
folder name is used as the label of every image inside it.
'''
tmp_dir = 'tmp'
img_size = IMG_SIZE

my_source_dir = DNEM_DIR

# threshold 0 tells resize_image to use the given label as is
threshold = 0
try:
    for label in LABELS:
        source_dir = os.path.join(my_source_dir, label)
        for image in os.listdir(source_dir):
            # Hidden files are skipped first, so that hidden PDFs (for example
            # "._name.pdf") are not sent to the converter.
            if image.startswith("."):
                continue
            if image.lower().endswith(".pdf"):
                new_image_name = convert_pdf_to_png(image, tmp_dir, source_dir)
                resize_image(new_image_name, img_size, tmp_dir, threshold, given_label=label)
            else:
                resize_image(image, img_size, source_dir, threshold, given_label=label)
finally:
    # Remove the temporary PNGs even when a conversion or resize failed.
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)


## Split na train/test

In [9]:
# Rebuild the split from scratch: drop any previous output directory, then
# distribute the labelled images from DATA_DIR in an 80/20 ratio.
# NOTE(review): the earlier comment said this yields three folders with an
# empty test folder; with a two-element ratio only two folders are expected -
# confirm against the installed split-folders version.
if os.path.exists(TRAIN_TEST_DIR):
    shutil.rmtree(TRAIN_TEST_DIR)
splitfolders.ratio(
    DATA_DIR,
    output=TRAIN_TEST_DIR,
    ratio=(0.8, 0.2),
)

Copying files: 427 files [00:00, 3233.16 files/s]


## Konvertace do png

In [10]:
# Convert every PDF in ORIGINALS_DIR to a PNG stored next to it.
source_dir = ORIGINALS_DIR
for image in os.listdir(source_dir):
    # Hidden files are skipped first, so that hidden PDFs are not converted.
    if image.startswith("."):
        continue
    if image.lower().endswith(".pdf"):
        # The target directory already is ORIGINALS_DIR, so the extra copy
        # that save_converted_image would write to the same path is skipped.
        new_image_name = convert_pdf_to_png(
            image, source_dir, source_dir, save_converted_image=False)