In [1]:
%load_ext autoreload
%autoreload 2

In [15]:
from src.game import LeducPokerState
from src.cfr import CFRUtility, Exploitability

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import copy

torch.autograd.set_detect_anomaly(False)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f8df4a620b8>

In [77]:
# class ReservoirSampler:
#     """
#     Select uniformly 'size' elements from a stream of unknown length
#     """
    
#     def __init__( self, size ):
#         self.index = 0
#         self.size = size
#         self.reservoir = []
    
#     def add_sample( self, sample ):
#         self.index += 1
#         if len( self.reservoir ) < self.size:
#             self.reservoir.append( sample )
#             return True
        
#         j = np.random.randint( 0, self.index )
#         include = j < self.size
#         if include:
#             self.reservoir[j] = sample
#         return include

class ReservoirSampler:
    """
    Fixed-capacity sample buffer.

    Samples are appended until `size` entries are stored. Once the buffer is full,
    every new sample overwrites a uniformly chosen stored entry, so a new sample
    is always accepted.
    """

    def __init__( self, size ):
        self.reservoir = []  # stored samples, at most `size` of them
        self.size = size     # capacity of the buffer
        self.index = 0       # number of samples offered so far

    def add_sample( self, sample ):
        """
        Stores `sample` in the buffer and returns True.
        """
        self.index += 1
        if len( self.reservoir ) >= self.size:
            # buffer is full -> overwrite a randomly chosen old entry
            victim = np.random.randint( 0, len( self.reservoir ) )
            self.reservoir[victim] = sample
        else:
            self.reservoir.append( sample )
        return True

"""
Deep CFR overview
input: hole + community card embeddings, betting history
    3 cards; 2 betting rounds x ( 3 actions, 3 bet sizes relative to pot size )
    
    1. Run MCCFR traversals: the network theta_t predicts regrets, from which the strategy is formed;
       the immediate regrets are saved for future training.
    2. Save the opponent's action distribution for learning the average-strategy network.
    3. After K CFR traversals for each player, retrain theta_{t+1} from scratch.
"""

def encode_card( card ):
    """
    Maps a card symbol to its integer code:
    "" -> -1 (no card), "J" -> 0, "Q" -> 1, "K" -> 2.
    Raises Exception for any other value.
    """
    for symbol, code in ( ( "", -1 ), ( "J", 0 ), ( "Q", 1 ), ( "K", 2 ) ):
        if card == symbol:
            return code
    raise Exception( "Unknown card" )
    
def _encode_round_bets( actions, pots, bet_size, bets, offset ):
    """
    Encodes the actions of one betting round into `bets[offset:]`, one slot per action.

    actions: sequence of action symbols of the round, in the order they were played
    pots: [ contribution of player 0, contribution of player 1 ], updated in place
    bet_size: fixed bet size of this round
    bets: numpy array receiving the chips added by each action, relative to the pot before the action
    offset: index of the first slot of this round inside `bets`
    """
    for i, action in enumerate( actions ):
        # players alternate, the player at index 0 acts first in the round
        current_player = i % 2
        other_player = 1 - current_player
        # "c" only matches the opponent's contribution; any other action also adds a bet on top
        update = pots[other_player] - pots[current_player]
        if action != "c":
            update += bet_size
        bets[offset + i] = update / sum( pots )
        pots[current_player] += update

def state_to_dcfr_observation( state, player ):
    """
    Converts Leduc Poker state to input of Deep CFR neural network

    state: Leduc Poker state; reads hole_cards, community_card, stage_history, ante,
           preflop_bet and flop_bet
    player: index of the player whose hole card is encoded

    returns: ( [ hole, community ], bets )
        hole, community: scalar tensors with the card code from `encode_card` ( -1 == no community card )
        bets: float64 tensor of 8 values; slots 0-3 hold the preflop actions, slots 4-7 the flop actions,
              each as chips added divided by the pot before the action

    NOTE(review): slots of actions that have not happened stay 0, which is the same value
    a check produces, so the two cannot be told apart from `bets` alone.
    NOTE(review): assumes at most 4 actions per betting round - TODO confirm against the game rules.
    """
    hole = torch.tensor( encode_card( state.hole_cards[player][0] ) )
    if len( state.community_card ) == 0:
        community = torch.tensor( encode_card( "" ) )
    else:
        community = torch.tensor( encode_card( state.community_card[0] ) )

    bets = np.zeros( 8 )

    # both players start with the ante in the pot
    pots = [ state.ante, state.ante ]

    preflop_history = state.stage_history.get( LeducPokerState.Stage.preflop )
    if preflop_history:
        _encode_round_bets( preflop_history, pots, state.preflop_bet, bets, 0 )

    flop_history = state.stage_history.get( LeducPokerState.Stage.flop )
    if flop_history:
        _encode_round_bets( flop_history, pots, state.flop_bet, bets, 4 )

    return [ hole, community ], torch.tensor( bets )


In [78]:
# example 1 of Deep CFR observation: hole cards are dealt, then a bet is called
state = LeducPokerState.initial_state()
for move in ( "KQ", "b", "c" ): # "KQ" deals holecards
    state = state.next_state( move )
state_to_dcfr_observation( state, 0 )

([tensor(2), tensor(-1)],
 tensor([1.0000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        dtype=torch.float64))

In [79]:
# example 2 of Deep CFR observation: same preflop as example 1, then the community
# card "K" followed by bet, bet (raise), call
state = LeducPokerState.initial_state()
for move in ( "KQ", "b", "c", "K", "b", "b", "c" ): # "KQ" deals holecards
    state = state.next_state( move )
state_to_dcfr_observation( state, 0 )

([tensor(2), tensor(2)],
 tensor([1.0000, 0.5000, 0.0000, 0.0000, 0.6667, 0.8000, 0.2222, 0.0000],
        dtype=torch.float64))

In [80]:
class CardEmbedding( nn.Module ):
    """
    Embeds a card index ( 0 - J, 1 - Q, 2 - K ); the index -1 ( no card ) maps to a zero vector.
    """
    def __init__( self, dim ):
        super().__init__()
        self.cards = 3 # J, Q, K
        self.embedding = nn.Embedding( self.cards, dim )
        
    def forward( self, x ):
        """
        x: integer tensor of card indices, a scalar or a batch of any shape
        returns: float tensor of shape x.shape + ( dim, )
        """
        # trailing axis lets the mask broadcast over the embedding dimension for batched input as well
        valid = x.ge( 0 ).float().unsqueeze( -1 )
        # negative indices are not valid embedding rows: look up row 0 and zero the result out
        x = x.clamp( min=0 )
        
        return self.embedding( x ) * valid

class DeepCFRModel( nn.Module ):
    """
    Network mapping ( cards, betting history ) to one value per action [ fold, call, bet ].
    Accepts a single observation or a batch of observations stacked along a leading axis.
    """
    def __init__( self, embed_dim ):
        super().__init__()
        self.hole_embedding = CardEmbedding( embed_dim )
        self.community_embedding = CardEmbedding( embed_dim )
        
        self.card1 = nn.Linear( embed_dim * 2, 64 )
        self.bet1 = nn.Linear( 8 * 2, 64 ) # 8 bet sizes + 8 "bet occured" flags
        
        self.comb = nn.Linear( 64 * 2, 64 )
        self.action_head = nn.Linear( 64, 3 ) 
    
    def forward( self, cards, bets ):
        """
        cards: [ hole, community ] ... 0 - J, 1 - Q, 2 - K, -1 == no card
        bets: [ preflop_first_action, preflop_second, preflop_third, ... ]

        returns: tensor with 3 values along the last axis, [ fold, call, bet ] regrets
        """
        # concatenate along the last axis so that a leading batch axis is preserved
        card_embs = torch.cat( [ self.hole_embedding( cards[0] ), self.community_embedding( cards[1] ) ], dim=-1 )
        
        x = F.relu( self.card1( card_embs ) )
        # NOTE(review): this flag is 1 for every bet >= 0, so it only carries information
        # if the encoder marks actions that did not happen with a negative value
        bet_occured = bets.ge( 0 )
        y = F.relu( self.bet1( torch.cat( [ bets.float(), bet_occured.float() ], dim=-1 ) ) )
        z = torch.cat( [ x, y ], dim=-1 )
        z = F.relu( self.comb( z ) )
        return self.action_head( z ) # returns [ fold, call, bet ] regrets

In [86]:
def train_nn( net, regret_memory, epochs=100, lr=0.001 ):
    """
    Retrains neural network `net` with samples from `regret_memory`

    net: module called as net( *x ) for every stored sample ( x, y )
    regret_memory: object whose `reservoir` attribute is a list of ( x, y ) samples,
                   y being the regression target for net( *x )
    epochs: number of full passes over the reservoir, one optimizer step per pass
    lr: learning rate of the Adam optimizer

    The gradient of the summed per-sample MSE losses is accumulated over the whole
    reservoir before each optimizer step. Does nothing when the reservoir is empty.
    """
    reservoir = regret_memory.reservoir
    if len( reservoir ) == 0:
        return
    
    # convert the targets once instead of once per epoch
    samples = [ ( x, torch.tensor( y ).float() ) for x, y in reservoir ]
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam( net.parameters(), lr=lr )
    
    for _ in range( epochs ):
        optimizer.zero_grad()
        for x, target in samples:
            loss = criterion( net( *x ), target )
            loss.backward()
        # guards against exploding accumulated gradients
        torch.nn.utils.clip_grad_value_( net.parameters(), 1e6 )
        optimizer.step()

def mask_available_actions( regret, available_actions ):
    """
    Returns a numpy array of the same shape as `regret` that keeps the entries of the
    available actions and is zero everywhere else
    """
    kept = np.zeros( regret.shape )
    for slot in map( action_to_index, available_actions ):
        kept[slot] = regret[slot]
    return kept

def action_to_index( action ):
    """
    Maps an action symbol to its slot in the network output: "f" -> 0, "c" -> 1, "b" -> 2.
    Raises Exception for any other value.
    """
    if action == "f":
        return 0
    if action == "c":
        return 1
    if action == "b":
        return 2
    raise Exception( f"Unknown action {action}" )
    
def index_to_action( index ):
    """
    Inverse of the action encoding: 0 -> "f", 1 -> "c", 2 -> "b".
    Raises Exception for any other index.
    """
    for position, action in enumerate( ( "f", "c", "b" ) ):
        if index == position:
            return action
    raise Exception( f"Unknown index {index}" )
    
def deep_cfr_traverse( state, update_player, nn_list, regret_memory, strategy_memory, t ):
    """
    One recursive traversal of the game tree from `state`.

    At nodes of `update_player` every available action is explored and the immediate
    regrets are added to `regret_memory[update_player]`; at the other player's nodes
    the current strategy is added to `strategy_memory` and a single action is sampled.

    state: game state to traverse from
    update_player: index of the player whose regrets are collected
    nn_list: regret networks, indexed by player
    regret_memory: sample buffers for ( observation, regrets ), indexed by player
    strategy_memory: sample buffer for ( observation, action probabilities )
    t: iteration number ( passed through the recursion, currently not used )

    returns: payoff of `update_player` at terminal states, otherwise the utility of
             `update_player` under the current strategy along the traversed path
    """
    if state.is_terminal():
        return state.get_payoff( update_player )
    
    if state.is_chance():
        next_state = state.next_state()
        return deep_cfr_traverse( next_state, update_player, nn_list, regret_memory, strategy_memory, t )

    available_actions = state.get_available_actions()
    cards, bets = state_to_dcfr_observation( state, state.get_player() )
    net = nn_list[state.current_player]
    # only the predicted values are needed here, so no autograd graph is built
    with torch.no_grad():
        nn_regret = net( cards, bets ).numpy()
    
    if np.any( np.isnan( nn_regret ) ):
        print( "nn_regret", nn_regret )
        print( cards, bets )
        print( "State:", state.stage_history, state.hole_cards, state.community_card )
        print( "Update player:", update_player )

    # regret matching: play proportionally to the positive regrets of the available actions
    nn_regret[nn_regret < 0] = 0
    nn_regret = mask_available_actions( nn_regret, available_actions )
    if np.isclose( 0, sum( nn_regret ) ):
        # no positive regret -> uniform over the available actions
        nn_regret = np.ones( 3 )
        nn_regret = mask_available_actions( nn_regret, available_actions )

    prob_distribution = nn_regret / sum( nn_regret )

    if state.current_player == update_player:
        expected_utility = 0
        action_utility = dict()
        for action in available_actions:
            p_a = prob_distribution[action_to_index( action )]
            next_state = state.next_state( action )
            action_utility[action] = deep_cfr_traverse( next_state, update_player, nn_list, regret_memory, strategy_memory, t )
            expected_utility += p_a * action_utility[action]
        # immediate regret of an action = its utility minus the utility of the current strategy
        regrets = np.zeros( 3 )
        for action in available_actions:
            i = action_to_index( action )
            regrets[i] = action_utility[action] - expected_utility
        regret_memory[update_player].add_sample( ( copy.deepcopy( [ cards, bets ] ), regrets ) )
            
        return expected_utility
    
    #case: state.current_player != update_player
    strategy_memory.add_sample( ( copy.deepcopy( [ cards, bets ] ), prob_distribution ) )
    i = np.random.choice( range( 3 ), p=prob_distribution )
    action = index_to_action( i )
    next_state = state.next_state( action )
    return deep_cfr_traverse( next_state, update_player, nn_list, regret_memory, strategy_memory, t )

def nn_to_strategy( nn ):
    """
    Wraps regret network `nn` into a strategy function.

    The returned function takes a state and returns a numpy array with one probability
    per available action ( in the order of state.get_available_actions() ), proportional
    to the positive network outputs; uniform when no output is positive.
    """
    def strategy( state ):
        observation = state_to_dcfr_observation( state, state.get_player() )
        available_actions = state.get_available_actions()
        # inference only, so no autograd graph is built
        with torch.no_grad():
            output = nn( *observation )
        strategy_probs = np.zeros( len( available_actions ) )
        for i, action in enumerate( available_actions ):
            strategy_probs[i] = float( output[action_to_index( action )] )
        strategy_probs[strategy_probs < 0] = 0
        if np.isclose( sum( strategy_probs ), 0 ):
            strategy_probs = np.ones( len( available_actions ) )
        return strategy_probs / sum( strategy_probs )
    return strategy    

In [87]:
# training configuration
iterations = 100
traversals_per_iteration = 100
regret_memory_size = 5000
strategy_memory_size = 10000
embed_dim = 64
seed = 0
kwargs = { "log_interval": 1 }

# make the sampled actions and the network initialisation repeatable
# NOTE(review): chance nodes inside LeducPokerState may use another random generator - TODO confirm
np.random.seed( seed )
torch.manual_seed( seed )

nn1 = DeepCFRModel( embed_dim )
nn2 = DeepCFRModel( embed_dim )
nn_list = [ nn1, nn2 ]
regret_memory = [ ReservoirSampler( regret_memory_size ), ReservoirSampler( regret_memory_size ) ]
strategy_memory = ReservoirSampler( strategy_memory_size )
total_iterations = 0


for i in range( iterations ):
    # collect regrets for each player in turn, then refit that player's network
    for p in range( 2 ):
        for k in range( traversals_per_iteration ):
            deep_cfr_traverse( LeducPokerState.initial_state(), p, nn_list, regret_memory, strategy_memory, i + 1 )
#         nn_list[p] = DeepCFRModel( 64 )
        train_nn( nn_list[p], regret_memory[p] )
        
    total_iterations += 1
    if total_iterations % kwargs.get( "log_interval", 10 ) == 0:
        print( id( nn_list[0] ), id( nn_list[1] ) )
        # evaluate the networks currently held in nn_list, so the logged EV stays
        # correct even if a network object gets replaced during training
        strategy1 = nn_to_strategy( nn_list[0] )
        strategy2 = nn_to_strategy( nn_list[1] )
        ev = CFRUtility.evaluate_strategies( LeducPokerState.initial_state(), [ strategy1, strategy2 ], [True,True] )
        print( f"Iteration {total_iterations}, EV = {ev}" )
        print( len( regret_memory[0].reservoir ), len( regret_memory[1].reservoir ) )
        
    print( f"Iteration {i} finished" )

# later cells read nn1 / nn2, keep them pointing at the trained networks
nn1, nn2 = nn_list

140246386593576 140246386593464
Iteration 1, EV = -1.068761245693822
593 312
Iteration 0 finished
140246386593576 140246386593464
Iteration 2, EV = -0.8167382690278326
1002 611
Iteration 1 finished
140246386593576 140246386593464
Iteration 3, EV = -0.711158215088711
1580 909
Iteration 2 finished
140246386593576 140246386593464
Iteration 4, EV = -0.41032412668897
2119 1211
Iteration 3 finished
140246386593576 140246386593464
Iteration 5, EV = -0.29095699545158343
2669 1512
Iteration 4 finished
140246386593576 140246386593464
Iteration 6, EV = -0.29475472330347074
3210 1818
Iteration 5 finished
140246386593576 140246386593464
Iteration 7, EV = -0.09952204278861071
3713 2105
Iteration 6 finished
140246386593576 140246386593464
Iteration 8, EV = -0.17333264610231167
4214 2373
Iteration 7 finished
140246386593576 140246386593464
Iteration 9, EV = -0.31943908882674776
4672 2645
Iteration 8 finished
140246386593576 140246386593464
Iteration 10, EV = -0.16728541695093474
5000 2901
Iteration 9 

KeyboardInterrupt: 

In [88]:
# number of regret samples currently stored for player 0
len( regret_memory[0].reservoir )

5000

In [89]:
# one strategy function per player, built from that player's regret network
strategy1 = nn_to_strategy( nn1 )
strategy2 = nn_to_strategy( nn2 )

def combined_nn_strategy( state ):
    """
    Strategy for the whole game: uses strategy1 when player 0 is to act, strategy2 otherwise
    """
    acting_strategy = strategy1 if state.get_player() == 0 else strategy2
    return acting_strategy( state )

In [90]:
from src.cfr import Exploitability

# evaluate the combined two-player strategy with the project's Exploitability helper
exploit = Exploitability( LeducPokerState.initial_state(), combined_nn_strategy, pass_state=True )
exploit.compute()
exploitability = exploit.get_ev()["exploitability"]
print( "Deep CFR exploitability =", exploitability )

Deep CFR exploitability = 3.6713903056225563
