import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.distributions import Normal
from lagom.networks import Module
from lagom.networks import ortho_init
class MDNHead(Module):
def __init__(self, in_features, out_features, num_density, **kwargs):
super().__init__(**kwargs)
self.in_features = in_features
self.out_features = out_features
self.num_density = num_density
self.pi_head = nn.Linear(in_features, out_features*num_density)
ortho_init(self.pi_head, weight_scale=0.01, constant_bias=0.0)
self.mean_head = nn.Linear(in_features, out_features*num_density)
ortho_init(self.mean_head, weight_scale=0.01, constant_bias=0.0)
self.logvar_head = nn.Linear(in_features, out_features*num_density)
ortho_init(self.logvar_head, weight_scale=0.01, constant_bias=0.0)
    def forward(self, x):
        # each head maps [N, in_features] to [N, K*D], reshaped to [N, K, D]
        logit_pi = self.pi_head(x).view(-1, self.num_density, self.out_features)
        mean = self.mean_head(x).view(-1, self.num_density, self.out_features)
        logvar = self.logvar_head(x).view(-1, self.num_density, self.out_features)
        # parameterize the standard deviation via log-variance to guarantee positivity
        std = torch.exp(0.5*logvar)
        return logit_pi, mean, std
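    # Hypothetical shape example (sizes made up): with in_features=64,
    # out_features=2 and num_density=5, an input x of shape [N, 64] yields
    # logit_pi, mean and std each of shape [N, 5, 2].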
    def loss(self, logit_pi, mean, std, target):
r"""Calculate the MDN loss function.
The loss function (negative log-likelihood) is defined by:
.. math::
L = -\frac{1}{N}\sum_{n=1}^{N}\ln \left( \sum_{k=1}^{K}\prod_{d=1}^{D} \pi_{k}(x_{n, d})
\mathcal{N}\left( \mu_k(x_{n, d}), \sigma_k(x_{n,d}) \right) \right)
        For better numerical stability, it can be evaluated in log-space:
.. math::
L = -\frac{1}{N}\sum_{n=1}^{N}\ln \left( \sum_{k=1}^{K}\exp \left\{ \sum_{d=1}^{D}
\ln\pi_{k}(x_{n, d}) + \ln\mathcal{N}\left( \mu_k(x_{n, d}), \sigma_k(x_{n,d})
\right) \right\} \right)
        .. note::

            One should always use the second formula via the log-sum-exp trick. The first
            formula is numerically unstable and can produce ``Inf`` and ``NaN`` values.
The log-sum-exp trick is defined by
.. math::
\log\sum_{i=1}^{N}\exp(x_i) = a + \log\sum_{i=1}^{N}\exp(x_i - a)
where :math:`a = \max_i(x_i)`
Args:
logit_pi (Tensor): the logit of mixing coefficients, shape [N, K, D]
mean (Tensor): mean of Gaussian mixtures, shape [N, K, D]
std (Tensor): standard deviation of Gaussian mixtures, shape [N, K, D]
target (Tensor): target tensor, shape [N, D]
Returns:
Tensor: calculated loss
"""
# target shape [N, D] to [N, 1, D]
target = target.unsqueeze(1)
        # log of mixing coefficients, normalized over the K mixture components
        log_pi = F.log_softmax(logit_pi, dim=1)
        dist = Normal(mean, std)
        # Gaussian log-density, broadcasting target [N, 1, D] against [N, K, D]
        log_probs = dist.log_prob(target)
# [N, K, D] to [N, K]
joint_log_probs = torch.sum(log_pi + log_probs, dim=-1, keepdim=False)
# [N, K] to [N]
loss = torch.logsumexp(joint_log_probs, dim=-1, keepdim=False)
loss = -loss.mean(0)
return loss
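    # A hedged numeric illustration (values made up) of why the log-sum-exp form
    # matters: with joint log-probabilities around -1000 per component, exp(-1000)
    # underflows to 0 and the naive formula gives log(0) = -inf, whereas
    # torch.logsumexp(torch.tensor([-1000., -1001.]), dim=0) returns a finite
    # value close to -999.69.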
    def sample(self, logit_pi, mean, std, tau=1.0):
r"""Sample from Gaussian mixtures using reparameterization trick.
- Firstly sample categorically over mixing coefficients to determine a specific Gaussian
- Then sample from selected Gaussian distribution
Args:
logit_pi (Tensor): the logit of mixing coefficients, shape [N, K, D]
mean (Tensor): mean of Gaussian mixtures, shape [N, K, D]
std (Tensor): standard deviation of Gaussian mixtures, shape [N, K, D]
            tau (float): temperature for sampling; it controls uncertainty.
* If :math:`\tau > 1`: increase uncertainty
* If :math:`\tau < 1`: decrease uncertainty
Returns:
Tensor: sampled data with shape [N, D]
"""
N, K, D = logit_pi.shape
pi = F.softmax(logit_pi/tau, dim=1)
        # flatten [N, K, D] to [N*D, K]; reshape() handles the non-contiguous
        # layout produced by permute()
        pi = pi.permute(0, 2, 1).reshape(-1, K)
        mean = mean.permute(0, 2, 1).reshape(-1, K)
        std = std.permute(0, 2, 1).reshape(-1, K)
        # pick one mixture component per (sample, output dimension) pair
        pi_samples = Categorical(pi).sample()
        mean = mean[torch.arange(N*D), pi_samples]
        std = std[torch.arange(N*D), pi_samples]
        # reparameterized Gaussian sample, with variance scaled by the temperature
        eps = torch.randn_like(std)
        samples = mean + eps*std*np.sqrt(tau)
samples = samples.view(N, D)
return samples
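

# A minimal usage sketch, not part of the original module: it assumes a small
# hypothetical feature extractor in front of MDNHead; the layer sizes and random
# data below are made up for illustration only.
if __name__ == '__main__':
    feature_net = nn.Sequential(nn.Linear(3, 32), nn.ReLU())
    head = MDNHead(in_features=32, out_features=1, num_density=5)
    x = torch.randn(16, 3)
    target = torch.randn(16, 1)
    logit_pi, mean, std = head(feature_net(x))            # each [16, 5, 1]
    loss = head.loss(logit_pi, mean, std, target)         # scalar negative log-likelihood
    samples = head.sample(logit_pi, mean, std, tau=0.5)   # [16, 1]
    print(loss.item(), samples.shape)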