Nodes

This class is used to get the nodes of the word graph.

Methods:

Name	Description
`__init__`	(self, target_word: str, dataset: List[str], level: int, k: int, c: int, word2vec_model: Word2VecInference, mlm_model: Union[RobertaInference, BertInference]) The constructor of the Nodes class.
`get_similar_nodes`	(self, word: str, keep_k: int = 50) -> List[str] This method is used to get the similar nodes of a word.
`get_context_nodes`	(self, word: str) -> List[str] This method is used to get the context nodes of a word.
`get_nodes`	(self) -> Dict[str, List[str]] This method is used to get the nodes of the word graph.
`get_node_features`	(self, nodes: Dict[str, List[str]]) This method is used to get the features of the nodes of the word graph.

Source code in semantics/graphs/temporal_graph.py

class Nodes:
    """
    This class is used to get the nodes of the word graph.

    Starting from a target word, it collects "similar" words from the MLM
    model and "context" words from the word2vec model, level by level, and
    can turn the collected words into index, feature and embedding arrays.

    methods:
        __init__(self, target_word: str, dataset: List[str], level: int, k: int, c: int, word2vec_model: Word2VecInference, mlm_model: Union[RobertaInference, BertInference])
            The constructor of the Nodes class.
        get_similar_nodes(self, word: str, keep_k: int = 50) -> List[str]
            This method is used to get the similar nodes of a word.
        get_context_nodes(self, word: str) -> List[str]
            This method is used to get the context nodes of a word.
        get_nodes(self) -> Dict[str, List[List[str]]]
            This method is used to get the nodes of the word graph.
        get_node_features(self, nodes: Dict[str, List[List[str]]])
            This method is used to get the features of the nodes of the word graph.
    """
    def __init__(
            self,
            target_word: str,
            dataset: List[str],
            level: int,
            k: int,
            c: int,
            word2vec_model: Word2VecInference,
            mlm_model: Union[RobertaInference, BertInference]
            ):

        """
        Args:
            target_word (str): the word to get the nodes for
            dataset (List[str]): the sentences to get the nodes from
            level (int): the number of levels of the graph to build
            k (int): the number of similar nodes to request from the MLM model for each sentence
            c (int): the number of context nodes to request from the word2vec model
            word2vec_model (Word2VecInference): the word2vec model's Inference class
            mlm_model (RobertaInference, BertInference): the MLM model's Inference class
        """

        self.target_word = target_word
        self.dataset = dataset
        self.k = k
        self.c = c
        self.level = level
        self.word2vec = word2vec_model
        self.mlm = mlm_model


    def get_similar_nodes(self, word: str, keep_k: int = 50) -> List[str]:
        """
        This method is used to get the similar nodes of a word using the MLM model.

        Every sentence of the dataset is passed to ``self.mlm.get_top_k_words``
        together with ``word`` and ``self.k``; the pooled predictions are then
        reduced with ``most_frequent``.

        Args:
            word (str): the word to get the similar nodes for
            keep_k (int): the number of words ``most_frequent`` is asked to keep from the pooled predictions. Default: 50.

        Returns:
            similar_nodes (List[str]): the list of similar nodes of the word
        """
        print('Getting the similar nodes of the word: ')
        similar_nodes = []
        # The context manager closes the bar even when the MLM call raises.
        with tqdm.tqdm(
            total=len(self.dataset),
            desc=f"{word}",
            dynamic_ncols=True
            ) as progress_bar:
            for sentence in self.dataset:
                similar_nodes += self.mlm.get_top_k_words(word, sentence, self.k)
                progress_bar.update(1)

        # Rank on the raw pooled predictions. De-duplicating first would give
        # every word a count of one and make the frequency ranking meaningless.
        # NOTE(review): assumes most_frequent returns distinct words ordered by
        # count - confirm against its definition.
        similar_nodes = most_frequent(similar_nodes, keep_k)
        print(f'{len(similar_nodes)} similar nodes found.', '\n')
        return similar_nodes

    def get_context_nodes(self, word: str) -> List[str]:
        """
        This method is used to get the context nodes of a word using the word2vec model.

        Args:
            word (str): the word to get the context nodes for

        Returns:
            context_nodes (List[str]): the distinct context nodes of the word, in the order the word2vec model returned them
        """
        print(f'Getting the context nodes of the word "{word}" ...')
        # get_top_k_words returns a pair; only the first element (the words) is used.
        ranked_words, _ = self.word2vec.get_top_k_words(word, self.c)

        # dict.fromkeys drops duplicates while keeping first-seen order, so the
        # result is identical on every run (a set's order depends on the hash seed).
        context_nodes = list(dict.fromkeys(ranked_words))
        print(f'{len(context_nodes)} context nodes found.', '\n')
        return context_nodes

    def get_nodes(self) -> Dict[str, List[List[str]]]:
        """
        This method is used to get the nodes of the word graph (similar nodes, context nodes, and target node).

        Level 0 expands the target word; every later level expands all similar
        nodes of the previous level followed by all of its context nodes. Words
        are not de-duplicated across expansions, so a level may contain repeats.

        Returns:
            nodes (Dict[str, List[List[str]]]): the nodes of the word graph. Each of the keys
                'similar_nodes' and 'context_nodes' maps to one list of words per level;
                'target_node' holds a single one-word list (empty when ``self.level`` is 0 or less).
        """
        nodes = {'target_node': [], 'similar_nodes': [], 'context_nodes': []}
        for level in range(self.level):
            print(f'Getting the nodes of level {level} ...')
            if level == 0:
                nodes['target_node'].append([self.target_word])
                words_to_expand = [self.target_word]
            else:
                # Similar nodes first, then context nodes, matching the order
                # in which the previous level was collected.
                words_to_expand = nodes['similar_nodes'][level-1] + nodes['context_nodes'][level-1]

            similar_nodes = []
            context_nodes = []
            for word in words_to_expand:
                similar_nodes += self.get_similar_nodes(word, keep_k=5)
                context_nodes += self.get_context_nodes(word)

            nodes['similar_nodes'].append(similar_nodes)
            nodes['context_nodes'].append(context_nodes)
        return nodes

    def get_node_features(self, nodes: Dict[str, List[List[str]]]):
        """
        This method is used to get the features of the nodes of the word graph.

        Nodes are numbered in the order: target node, similar nodes, context nodes,
        and within each type level by level.

        Args:
            nodes (Dict[str, List[List[str]]]): the nodes of the word graph, as returned by ``get_nodes``
                (keys 'target_node', 'similar_nodes', 'context_nodes', each holding one list of words per level)

        Returns:
            index (Dict[str, Dict]): the index of the nodes of the word graph. The index contains the 'index_to_key' and 'key_to_index' mapping dictionaries. Example: in the index_to_key dictionary {0: target_word}, and in the key_to_index dictionary {target_word: 0}.
            node_features (np.ndarray): the features of the nodes of the word graph of shape (num_nodes, 3) where num_nodes is the number of nodes in the graph. The features are:

                - node_type: the type of the node (target: 0, similar: 1, context: 2).

                - node_level: the position of the node's list inside ``nodes[node_type]``. The target node is level 0.

                - frequency: the value ``count_occurence(self.dataset, node)`` returns for the node.
            embeddings (np.ndarray): one row per node, the mean over axis 0 of ``self.mlm.get_embedding(main_word=node)``
                (documented upstream as shape (num_nodes, 768) - depends on the MLM model).

        Examples:
            >>> word2vec = Word2VecInference('word2vec.model')
            >>> mlm = RobertaInference('MLM_roberta')
            >>> n = Nodes(target_word='sentence', dataset=['this is a sentence', 'this is another sentence'], level=3, k=2, c=2, word2vec_model = word2vec, mlm_model = mlm)
            >>> nodes = n.get_nodes()
            >>> index, node_features, embeddings = n.get_node_features(nodes)
            >>> sorted(index)
            ['index_to_key', 'key_to_index']
            >>> node_features.shape[1]
            3
            >>> embeddings.shape[0] == node_features.shape[0]
            True
        """
        # Insertion order of this dict fixes the numbering order of the node types.
        type_codes = {'target_node': 0, 'similar_nodes': 1, 'context_nodes': 2}

        index_to_key = {}
        key_to_index = {}
        node_types = []
        node_levels = []
        frequencies = []
        embeddings = []
        for node_type, type_code in type_codes.items():
            for level, level_nodes in enumerate(nodes[node_type]):
                for node in level_nodes:
                    # NOTE(review): a word listed more than once gets one row per
                    # occurrence, and key_to_index keeps only its last row.
                    position = len(node_types)
                    index_to_key[position] = node
                    key_to_index[node] = position
                    node_types.append(type_code)
                    node_levels.append(level)
                    frequencies.append(count_occurence(self.dataset, node))
                    embeddings.append(self.mlm.get_embedding(main_word=node).mean(axis=0))

        embeddings = np.array(embeddings)
        # Stack the three feature lists as rows, then transpose to one row per node.
        # Embeddings are returned separately rather than appended to node_features.
        node_features = np.stack([node_types, node_levels, frequencies]).T

        index = {'index_to_key': index_to_key, 'key_to_index': key_to_index}
        return index, node_features, embeddings

`__init__(target_word, dataset, level, k, c, word2vec_model, mlm_model)`

Parameters:

Name	Type	Description	Default
`target_word`	`str`	the word to get the nodes for	required
`dataset`	`List[str]`	the sentences to get the nodes from	required
`level`	`int`	the level of the graph to get	required
`k`	`int`	the number of similar nodes to get for each occurrence of the target word	required
`c`	`int`	the number of context nodes to get for the target word	required
`word2vec_model`	`Word2VecInference`	the word2vec model's Inference class	required
`mlm_model`	`(RobertaInference, BertInference)`	the MLM model's Inference class	required

Source code in semantics/graphs/temporal_graph.py

def __init__(
        self,
        target_word: str,
        dataset: List[str],
        level: int,
        k: int,
        c: int,
        word2vec_model: Word2VecInference,
        mlm_model: Union[RobertaInference, BertInference]
        ):
    """
    Store the graph settings and the two inference models on the instance.

    Args:
        target_word (str): the word to get the nodes for
        dataset (List[str]): the sentences to get the nodes from
        level (int): the level of the graph to get
        k (int): the number of similar nodes to get for each occurrence of the target word
        c (int): the number of context nodes to get for the target word
        word2vec_model (Word2VecInference): the word2vec model's Inference class
        mlm_model (RobertaInference, BertInference): the MLM model's Inference class
    """
    # What the graph is built around and from.
    self.target_word, self.dataset = target_word, dataset

    # Size settings: graph depth, similar-node count, context-node count.
    self.level, self.k, self.c = level, k, c

    # Inference wrappers; nothing is called on them here.
    self.word2vec, self.mlm = word2vec_model, mlm_model

`get_context_nodes(word)`

This method is used to get the context nodes of a word using the word2vec model.

Parameters:

Name	Type	Description	Default
`word`	`str`	the word to get the context nodes for	required

Returns:

Name	Type	Description
`context_nodes`	`List[str]`	the list of context nodes of the word

Source code in semantics/graphs/temporal_graph.py

def get_context_nodes(self, word: str) -> List[str]:
    """
    This method is used to get the context nodes of a word using the word2vec model.

    Args:
        word (str): the word to get the context nodes for

    Returns:
        context_nodes (List[str]): the distinct context nodes of the word, in the order the word2vec model returned them
    """
    print(f'Getting the context nodes of the word "{word}" ...')
    # get_top_k_words returns a pair; only the first element (the words) is used.
    ranked_words, _ = self.word2vec.get_top_k_words(word, self.c)

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is identical on every run (a set's order depends on the hash seed).
    context_nodes = list(dict.fromkeys(ranked_words))
    print(f'{len(context_nodes)} context nodes found.', '\n')
    return context_nodes

`get_node_features(nodes)`

This method is used to get the features of the nodes of the word graph.

Parameters:

Name	Type	Description	Default
`nodes`	`Dict[str, List[str]]`	the nodes of the word graph	required

Returns:

Name	Type	Description
`index`	`Dict[str, Dict[int, str]]`	the index of the nodes of the word graph. The index contains the 'index_to_key' and 'key_to_index' mapping dictionaries. Example: in the index_to_key dictionary {0: target_word}, and in the key_to_index dictionary {target_word: 0}.
`node_features`	`ndarray`	the features of the nodes of the word graph of shape (num_nodes, 3) where num_nodes is the number of nodes in the graph. The features are: node_type: the type of the node (target: 0, similar: 1, context: 2). node_level: the level of the node in the graph. The target node is level 0. frequency: the frequency of the word node in the dataset.
`embeddings`	`ndarray`	the embeddings of the nodes of the word graph from the MLM model, of shape (num_nodes, 768).

Examples:

>>> word2vec = Word2VecInference('word2vec.model')
>>> mlm = RobertaInference('MLM_roberta')
>>> n = Nodes(target_word='sentence', dataset=['this is a sentence', 'this is another sentence'], level=3, k=2, c=2, word2vec_model = word2vec, mlm_model = mlm)
>>> nodes = n.get_nodes()
>>> index, node_features, embeddings = n.get_node_features(nodes)
>>> print(index)
{'index_to_key': {0: 'sentence', 1: 'this', 2: 'is', 3: 'a', 4: 'another'}, 'key_to_index': {'sentence': 0, 'this': 1, 'is': 2, 'a': 3, 'another': 4}}
>>> print(node_features)
[[0, 0, 2], [1, 1, 2], [1, 1, 2], [1, 1, 2], [2, 1, 2]]
>>> print(embeddings.shape)
(5, 768)

Source code in semantics/graphs/temporal_graph.py

def get_node_features(self, nodes: Dict[str, List[List[str]]]):
    """
    This method is used to get the features of the nodes of the word graph.

    Nodes are numbered in the order: target node, similar nodes, context nodes,
    and within each type level by level.

    Args:
        nodes (Dict[str, List[List[str]]]): the nodes of the word graph, with the keys
            'target_node', 'similar_nodes' and 'context_nodes', each holding one list of words per level

    Returns:
        index (Dict[str, Dict]): the index of the nodes of the word graph. The index contains the 'index_to_key' and 'key_to_index' mapping dictionaries. Example: in the index_to_key dictionary {0: target_word}, and in the key_to_index dictionary {target_word: 0}.
        node_features (np.ndarray): the features of the nodes of the word graph of shape (num_nodes, 3) where num_nodes is the number of nodes in the graph. The features are:

            - node_type: the type of the node (target: 0, similar: 1, context: 2).

            - node_level: the position of the node's list inside ``nodes[node_type]``. The target node is level 0.

            - frequency: the value ``count_occurence(self.dataset, node)`` returns for the node.
        embeddings (np.ndarray): one row per node, the mean over axis 0 of ``self.mlm.get_embedding(main_word=node)``
            (documented upstream as shape (num_nodes, 768) - depends on the MLM model).

    Examples:
        >>> word2vec = Word2VecInference('word2vec.model')
        >>> mlm = RobertaInference('MLM_roberta')
        >>> n = Nodes(target_word='sentence', dataset=['this is a sentence', 'this is another sentence'], level=3, k=2, c=2, word2vec_model = word2vec, mlm_model = mlm)
        >>> nodes = n.get_nodes()
        >>> index, node_features, embeddings = n.get_node_features(nodes)
        >>> sorted(index)
        ['index_to_key', 'key_to_index']
        >>> node_features.shape[1]
        3
        >>> embeddings.shape[0] == node_features.shape[0]
        True
    """
    # Insertion order of this dict fixes the numbering order of the node types.
    type_codes = {'target_node': 0, 'similar_nodes': 1, 'context_nodes': 2}

    index_to_key = {}
    key_to_index = {}
    node_types = []
    node_levels = []
    frequencies = []
    embeddings = []
    for node_type, type_code in type_codes.items():
        for level, level_nodes in enumerate(nodes[node_type]):
            for node in level_nodes:
                # NOTE(review): a word listed more than once gets one row per
                # occurrence, and key_to_index keeps only its last row.
                position = len(node_types)
                index_to_key[position] = node
                key_to_index[node] = position
                node_types.append(type_code)
                node_levels.append(level)
                frequencies.append(count_occurence(self.dataset, node))
                embeddings.append(self.mlm.get_embedding(main_word=node).mean(axis=0))

    embeddings = np.array(embeddings)
    # Stack the three feature lists as rows, then transpose to one row per node.
    # Embeddings are returned separately rather than appended to node_features.
    node_features = np.stack([node_types, node_levels, frequencies]).T

    index = {'index_to_key': index_to_key, 'key_to_index': key_to_index}
    return index, node_features, embeddings

`get_nodes()`

This method is used to get the nodes of the word graph (similar nodes, context nodes, and target node).

Returns:

Name	Type	Description
`nodes`	`Dict[str, List[str]]`	the nodes of the word graph

Source code in semantics/graphs/temporal_graph.py

def get_nodes(self) -> Dict[str, List[List[str]]]:
    """
    This method is used to get the nodes of the word graph (similar nodes, context nodes, and target node).

    Level 0 expands the target word; every later level expands all similar
    nodes of the previous level followed by all of its context nodes. Words
    are not de-duplicated across expansions, so a level may contain repeats.

    Returns:
        nodes (Dict[str, List[List[str]]]): the nodes of the word graph. Each of the keys
            'similar_nodes' and 'context_nodes' maps to one list of words per level;
            'target_node' holds a single one-word list (empty when ``self.level`` is 0 or less).
    """
    nodes = {'target_node': [], 'similar_nodes': [], 'context_nodes': []}
    for level in range(self.level):
        print(f'Getting the nodes of level {level} ...')
        if level == 0:
            nodes['target_node'].append([self.target_word])
            words_to_expand = [self.target_word]
        else:
            # Similar nodes first, then context nodes, matching the order
            # in which the previous level was collected.
            words_to_expand = nodes['similar_nodes'][level-1] + nodes['context_nodes'][level-1]

        similar_nodes = []
        context_nodes = []
        for word in words_to_expand:
            similar_nodes += self.get_similar_nodes(word, keep_k=5)
            context_nodes += self.get_context_nodes(word)

        nodes['similar_nodes'].append(similar_nodes)
        nodes['context_nodes'].append(context_nodes)
    return nodes

`get_similar_nodes(word, keep_k=50)`

This method is used to get the similar nodes of a word using the MLM model.

Parameters:

Name	Type	Description	Default
`word`	`str`	the word to get the similar nodes for	required
`keep_k`	`int`	the number of similar nodes to keep for each occurrence of the word. Default: 50.	`50`

Returns:

Name	Type	Description
`similar_nodes`	`List[str]`	the list of similar nodes of the word

Source code in semantics/graphs/temporal_graph.py

def get_similar_nodes(self, word: str, keep_k: int = 50) -> List[str]:
    """
    This method is used to get the similar nodes of a word using the MLM model.

    Every sentence of the dataset is passed to ``self.mlm.get_top_k_words``
    together with ``word`` and ``self.k``; the pooled predictions are then
    reduced with ``most_frequent``.

    Args:
        word (str): the word to get the similar nodes for
        keep_k (int): the number of words ``most_frequent`` is asked to keep from the pooled predictions. Default: 50.

    Returns:
        similar_nodes (List[str]): the list of similar nodes of the word
    """
    print('Getting the similar nodes of the word: ')
    similar_nodes = []
    # The context manager closes the bar even when the MLM call raises.
    with tqdm.tqdm(
        total=len(self.dataset),
        desc=f"{word}",
        dynamic_ncols=True
        ) as progress_bar:
        for sentence in self.dataset:
            similar_nodes += self.mlm.get_top_k_words(word, sentence, self.k)
            progress_bar.update(1)

    # Rank on the raw pooled predictions. De-duplicating first would give
    # every word a count of one and make the frequency ranking meaningless.
    # NOTE(review): assumes most_frequent returns distinct words ordered by
    # count - confirm against its definition.
    similar_nodes = most_frequent(similar_nodes, keep_k)
    print(f'{len(similar_nodes)} similar nodes found.', '\n')
    return similar_nodes

Nodes

__init__(target_word, dataset, level, k, c, word2vec_model, mlm_model)

get_context_nodes(word)

get_node_features(nodes)

get_nodes()

get_similar_nodes(word, keep_k=50)

`__init__(target_word, dataset, level, k, c, word2vec_model, mlm_model)`

`get_context_nodes(word)`

`get_node_features(nodes)`

`get_nodes()`

`get_similar_nodes(word, keep_k=50)`