openhgnn/models/HGNN_AC.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from . import BaseModel, register_model

@register_model('HGNN_AC')
class HGNN_AC(BaseModel):
    r"""
    Desctiption
    -----------
    HGNN_AC was introduced in `HGNN_AC <https://dl.acm.org/doi/10.1145/3442381.3449914>`__.
        
    It included four parts:

    - Pre-learning of Topological Embedding
        HGNN-AC first obtains more comprehensive node sequences by random walk according to the frequently used multiple meta-paths, 
        and then feeds these sequences to the skip-gram model to learn node embeddings :math:`H`.
        
    - Attribute Completion with Attention Mechanism
        HGNN-AC adopts a masked attention mechanism which means we only calculate :math:`e_{vu}` for nodes :math:`u\in{N_v^+}`, 
        where :math:`u\in{N_v^+}` denotes the first-order neighbors of node :math:`v` 
        in set :math:`V^+`, where :math:`V^+` is the set of nodes with attributes.
        
        .. math::
           e_{vu}=\sigma(h_v^{T}Wh_u)
        
        where :math:`W` is the parametric matrix, and :math:`\sigma` an activation function.
    
        Then, softmax function is applied to get normalized weighted coefficient :math:`a_{vu}`

        .. math::
           a_{vu}=softmax(e_{vu})=\frac{exp(e_{vu})}{\sum_{s\in{N_v^+}}{exp(e_{vs})}}

        HGNN-AC can perform weighted aggregation of attributes
        for node :math:`v`  according to weighted coefficient :math:`a_{vu}`  :

        .. math::
           X_v^C=\sum_{u\in{N_v^+}}{a_{vu}x_u}

        where :math:`N_v^+` denotes the set of neighbors of node :math:`v\in{V^+}`,
        and :math:`x_u` denotes the attributes of nodes :math:`u`.

        .. _here:
        
        Specially, the attention process is extended to a multi-head attention
        to stabilize the learning process and reduce the high variance

        .. math::
           X_v^C=mean(\sum_k^K {\sum_{u\in{N_v^+}}{a_{vu}x_u}})

        where :math:`K` means that we perform :math:`K` independent attention process.

    - Dropping some Attributes
        To be specific, for nodes in :math:`V^+`, HGNN-AC randomly divides them into two parts
        :math:`V_{drop}^+` and :math:`V_{keep}^+` according to a small ratio :math:`\alpha`, i.e. :math:`|V_{drop}^+|=\alpha|V^+|`.
        HGNN-AC first drops attributes of nodes in :math:`V_{drop}^+` and then 
        reconstructs these attributes via attributes of nodes :math:`V_{drop}^+` by conducting
        attribute completion.
        
        .. math::
           X_v^C=mean(\sum_k^K {\sum_{u\in{V_{keep}^+ \cap V_i^+}}{a_{vu}x_u}})

        It introduced a weakly supervised loss to optimize the parameters of attribute completion 
        and use euclidean distance as the metric to design the loss function as:
    
        .. math::
           L_{completion}=\frac{1}{|V_{drop}^+|}\sum_{i \in V_{drop}^+} \sqrt{(X_i^C-X_i)^2}
    
    - Combination with HIN Model
        Now, we have completed attributes nodes in :math:`V^-`(the set of nodes without attribute), and the raw attributes nodes in :math:`V+`, 
        Wthen the new attributes of all nodes are defined as:

        .. math::
           X^{new}=\{X_i^C,X_j|\forall i \in V^-, \forall j \in V^+\}

        the new attributes :math:`X^{new}`, together with network topology :math:`A`, as
        a new graph, are sent to the HIN model:

        .. math::
           \overline{Y}=\Phi(A,X^{new})
           L_{prediction}=f(\overline{Y},Y)
        
        where :math:`\Phi` denotes an arbitrary HINs model.

        the overall model can be optimized via back propagation in an end-to-end
        manner:

        .. math::
           L=\lambda L_{completion}+L_{prediction}
    
        where :math:`\lambda` is a weighted coefficient to balance these two parts.
        
    Parameters
    ----------
    in_dim: int
        nodes' topological embedding dimension
    hidden_dim: int
        hidden dimension 
    dropout: float
        the dropout rate of neighbor nodes dropout
    activation: callable activation function
        the activation function used in HGNN_AC.  default: ``F.elu``
    num_heads: int
        the number of heads in attribute completion with attention mechanism
    """
    @classmethod
    def build_model_from_args(cls, args, hg):
        return cls(in_dim = hg.nodes[hg.ntypes[0]].data['emb'].shape[1], 
                                hidden_dim = args.attn_vec_dim, 
                                dropout = args.dropout, activation = F.elu, 
                                num_heads = args.num_heads,
                                cuda = False if args.device == torch.device('cpu') else True)
    def __init__(self, in_dim, hidden_dim, dropout, activation, num_heads, cuda):
        super(HGNN_AC, self).__init__()
        self.dropout = dropout
        self.attentions = [AttentionLayer(in_dim, hidden_dim, dropout, activation, cuda) for _ in range(num_heads)]

        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)

    def forward(self, bias, emb_dest, emb_src, feature_src):
        r"""
        Description
        -----------
        This is the forward part of model HGNN_AC

        Parameters
        ----------
        bias: matrix
            adjacency matrix related to the source nodes
        emb_dest: matrix
            embeddings of the destination node
        emb_src: matrix
            embeddings of the source node
        feature_src: matrix
            features of the source node
            
        Returns
        -------
        features: matrix
            the new features of the type of node
        """
        
        #Attribute Completion with Attention Mechanism
        adj = F.dropout(bias, self.dropout, training=self.training)
        #x = sum_k(x_v)
        x = torch.cat([att(adj, emb_dest, emb_src, feature_src).unsqueeze(0) for att in self.attentions], dim=0)

        #X_{v}^{C} = mean(x)
        return torch.mean(x, dim=0, keepdim=False)


class AttentionLayer(nn.Module):
    r"""
    Description
    -------------------
    This is the attention process used in HGNN\_AC. For more details, you can check here_.
    
    Parameters
    -------------------
    in_dim: int
        nodes' topological embedding dimension
    hidden_dim: int
        hidden dimension
    dropout: float
        the drop rate used in the attention
    activation: callable activation function
        the activation function used in HGNN_AC.  default: ``F.elu``
    """
    def __init__(self, in_dim, hidden_dim, dropout, activation, cuda=False):
        super(AttentionLayer, self).__init__()
        self.dropout = dropout
        self.activation = activation
        self.is_cuda = cuda

        self.W = nn.Parameter(nn.init.xavier_normal_(
            torch.Tensor(in_dim, hidden_dim).type(torch.cuda.FloatTensor if cuda else torch.FloatTensor),
            gain=np.sqrt(2.0)), requires_grad=True)
        self.W2 = nn.Parameter(nn.init.xavier_normal_(torch.Tensor(hidden_dim, hidden_dim).type(
            torch.cuda.FloatTensor if cuda else torch.FloatTensor), gain=np.sqrt(2.0)),
            requires_grad=True)

        self.leakyrelu = nn.LeakyReLU(0.2)

    def forward(self, bias, emb_dest, emb_src, feature_src):
        r"""
        Description
        ----------------
        This is the forward part of the attention process.
        
        Parameters
        --------------
        bias: matrix
            the processed adjacency matrix related to the source nodes
        emb_dest: matrix
            the embeddings of the destination nodes
        emb_src: matrix
            the embeddings of the source nodes
        feature_src: matrix
            the features of the source nodes
        
        Returns
        ------------
        features: matrix
            the new features of the nodes
        """
        h_1 = torch.mm(emb_src, self.W)
        h_2 = torch.mm(emb_dest, self.W)

        #contribution of the neighbor nodes using a masked attention
        #e_{vu} = activation(h_v * W * h_u)
        e = self.leakyrelu(torch.mm(torch.mm(h_2, self.W2), h_1.t()))
        zero_vec = -9e15 * torch.ones_like(e)
        attention = torch.where(bias > 0, e, zero_vec)
        
        #get normalized weighted coefficient
        #a_{vu} = softmax(e_{vu})
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        #x_v = sum(a_{vu} * x_u)
        h_prime = torch.matmul(attention, feature_src)

        #return a new attribute
        return self.activation(h_prime)