Edges

This class is used to get the edges of the word graph.

Methods:

Name	Description
`__init__`	`(self, index_to_key: Dict[int, str], node_features: np.ndarray, node_embeddings: np.ndarray)` The constructor of the Edges class.
`get_similarity`	`(self, emb1: int, emb2: int) -> float` This method is used to get the similarity between two nodes.
`get_pmi`	`(self, data: List[str], word1: str, word2: str) -> float` This method is used to get the PMI between two nodes.
`get_edge_features`	`(self, dataset: List[str], sim_threshold: float = 0.5)` This method is used to get the edge features of the word graph.

Source code in semantics/graphs/temporal_graph.py

class Edges:
    """
    This class is used to get the edges of the word graph.

    It holds the node data of one graph (index-to-word mapping, node
    features and node embeddings) and derives from it the edge index and the
    per-edge features (edge type, embedding similarity, PMI).

    methods:
        __init__(self, index_to_key: Dict[int, str], node_features: np.ndarray, node_embeddings: np.ndarray)
            The constructor of the Edges class.
        get_similarity(self, emb1: int, emb2: int) -> float
            This method is used to get the similarity between two nodes.
        get_pmi(self, data: List[str], word1: str, word2: str) -> float
            This method is used to get the PMI between two nodes.
        get_edge_features(self, dataset: List[str], sim_threshold: float = 0.5)
            This method is used to get the edge features of the word graph.
    """
    def __init__(
            self,
            index_to_key: Dict[int, str],
            node_features: np.ndarray,
            node_embeddings: np.ndarray,
        ):
        """
        Args:
            index_to_key (Dict[int, str]): the index of the nodes of the word graph. The keys are the indices of the nodes and the values are the words of the nodes.
            node_features (np.ndarray): the features of the nodes of the word graph of shape (num_nodes, 3) where num_nodes is the number of nodes in the graph.
            node_embeddings (np.ndarray): the embeddings of the nodes of the word graph from the MLM model, of shape (num_nodes, 768).
        """

        self.index_to_key = index_to_key
        self.node_features = node_features
        self.node_embeddings = node_embeddings


    def get_similarity(self, emb1: int, emb2: int) -> float:
        """
        This method is used to get the cosine similarity between two nodes.

        Args:
            emb1 (int): the first index of the embedding in node_embeddings
            emb2 (int): the second index of the embedding in node_embeddings

        Returns:
            similarity (float): the cosine similarity between the two embeddings
        """
        # Each embedding is reshaped into a single-row batch because
        # torch.cosine_similarity compares along dim 1 by default.
        first = torch.tensor(self.node_embeddings[emb1]).reshape(1, -1)
        second = torch.tensor(self.node_embeddings[emb2]).reshape(1, -1)
        return torch.cosine_similarity(first, second).item()


    def get_pmi(self, data: List[str], word1: str, word2: str) -> float:
        """
        This method is used to get the PMI (pointwise mutual information,
        base 2) between two nodes.

        Args:
            data (List[str]): the dataset in which the occurrences are counted
            word1 (str): the first node (word)
            word2 (str): the second node (word)

        Returns:
            pmi (float): the PMI between the two words in the dataset. 0.0 is
                returned when the PMI is undefined, i.e. when the words never
                co-occur or when any of the counts is zero.
        """
        # count_occurence is defined outside this class. It is called with a
        # single word, with a pair of words and with no word at all;
        # presumably these yield the word count, the co-occurrence count and
        # the total count respectively (confirm against its definition).
        word1_count = count_occurence(data, word1)
        word2_count = count_occurence(data, word2)
        co_occurrence_count = count_occurence(data, [word1, word2])
        total_count = count_occurence(data)

        # Guard every denominator: an empty dataset or an unseen word would
        # otherwise raise ZeroDivisionError below.
        if (
            total_count <= 0
            or word1_count <= 0
            or word2_count <= 0
            or co_occurrence_count <= 0
        ):
            return 0.0

        # Calculate probabilities
        p_word1 = word1_count / total_count
        p_word2 = word2_count / total_count
        p_co_occurrence = co_occurrence_count / total_count

        # Calculate PMI
        return log(p_co_occurrence / (p_word1 * p_word2), 2)

    def get_edge_features(self, dataset: List[str], sim_threshold: float = 0.5):
        """
        This method is used to get the edge features of the word graph.

        Every unordered pair of nodes (including each node with itself) is
        visited once; an edge is kept when the embedding similarity of the
        pair is strictly greater than sim_threshold.

        Args:
            dataset (List[str]): the dataset to get the edge features from
            sim_threshold (float): the similarity threshold to create an edge between two nodes. Default: 0.5.

        Returns:
            edge_index (np.ndarray): the edge index of the word graph of shape (2, num_edges) where num_edges is the number of edges in the graph. The first row contains the indices of the first node of the edge and the second row contains the indices of the second node of the edge. An edge is created if the similarity between the two nodes is greater than sim_threshold.
            edge_features (np.ndarray): the edge features of the word graph of shape (num_edges, 3) where num_edges is the number of edges in the graph. The features are:

                - edge_type: the type of the edge (target-similar (1), target-context(2), similar-similar(3), similar-context(4), context-context(5), self-loop(0))

                - similarity: the similarity between node embeddings in the current snapshot

                - PMI: the PMI between nodes in the current snapshot

        Raises:
            ValueError: if a kept edge joins two distinct nodes whose node
                types (column 0 of node_features) have no edge type defined.
        """
        # Edge type per unordered pair of node types. The node type is read
        # from column 0 of node_features (0, 1 and 2 correspond to target,
        # similar and context in the edge-type names above). Keys are sorted
        # so the lookup does not depend on the order of the nodes.
        pair_to_edge_type = {
            (0, 1): 1,
            (0, 2): 2,
            (1, 1): 3,
            (1, 2): 4,
            (2, 2): 5,
        }

        edge_index_1 = []
        edge_index_2 = []
        edge_types = []
        similarities = []
        pmis = []
        num_nodes = self.node_features.shape[0]
        for word_idx1 in range(num_nodes):
            for word_idx2 in range(word_idx1, num_nodes):
                similarity = self.get_similarity(word_idx1, word_idx2)
                # Written as a negated ">" so that a NaN similarity is
                # skipped as well.
                if not (similarity > sim_threshold):
                    continue

                if word_idx1 == word_idx2:
                    edge_type = 0
                else:
                    # float() turns array scalars into plain numbers that
                    # hash like the integer keys of the table.
                    type_pair = tuple(sorted((
                        float(self.node_features[word_idx1][0]),
                        float(self.node_features[word_idx2][0]),
                    )))
                    edge_type = pair_to_edge_type.get(type_pair)
                    if edge_type is None:
                        raise ValueError(
                            f"No edge type defined for node types {type_pair} "
                            f"(nodes {word_idx1} and {word_idx2})"
                        )

                # The PMI is only needed for edges that are kept, so it is
                # computed after the threshold check.
                pmi = self.get_pmi(dataset, self.index_to_key[word_idx1], self.index_to_key[word_idx2])

                edge_index_1.append(word_idx1)
                edge_index_2.append(word_idx2)
                edge_types.append(edge_type)
                similarities.append(similarity)
                pmis.append(pmi)

        # An explicit integer dtype keeps edge_index integral even when no
        # edge passed the threshold (empty lists would default to float64).
        edge_index = np.array([edge_index_1, edge_index_2], dtype=np.int64)
        edge_features = np.stack([edge_types, similarities, pmis]).T

        return edge_index, edge_features

`__init__(index_to_key, node_features, node_embeddings)`

Parameters:

Name	Type	Description	Default
`index_to_key`	`Dict[int, str]`	the index of the nodes of the word graph. The keys are the indices of the nodes and the values are the words of the nodes.	required
`node_features`	`ndarray`	the features of the nodes of the word graph of shape (num_nodes, 3) where num_nodes is the number of nodes in the graph.	required
`node_embeddings`	`ndarray`	the embeddings of the nodes of the word graph from the MLM model, of shape (num_nodes, 768).	required

Source code in semantics/graphs/temporal_graph.py

def __init__(
        self,
        index_to_key: Dict[int, str],
        node_features: np.ndarray,
        node_embeddings: np.ndarray,
    ):
    """
    Keep the node data of the word graph on the instance.

    Args:
        index_to_key (Dict[int, str]): maps the index of each node of the word graph to the word of that node.
        node_features (np.ndarray): the node features of the word graph, of shape (num_nodes, 3) where num_nodes is the number of nodes in the graph.
        node_embeddings (np.ndarray): the node embeddings of the word graph from the MLM model, of shape (num_nodes, 768).
    """
    # The three arguments are stored unchanged under their own names.
    self.index_to_key, self.node_features, self.node_embeddings = (
        index_to_key,
        node_features,
        node_embeddings,
    )

`get_edge_features(dataset, sim_threshold=0.5)`

This method is used to get the edge features of the word graph.

Parameters:

Name	Type	Description	Default
`dataset`	`List[str]`	the dataset to get the edge features from	required
`sim_threshold`	`float`	the similarity threshold to create an edge between two nodes. Default: 0.5.	`0.5`

Returns:

Name	Type	Description
`edge_index`	`ndarray`	the edge index of the word graph of shape (2, num_edges) where num_edges is the number of edges in the graph. The first row contains the indices of the first node of the edge and the second row contains the indices of the second node of the edge. An edge is created if the similarity between the two nodes is greater than sim_threshold.
`edge_features`	`ndarray`	the edge features of the word graph of shape (num_edges, 3) where num_edges is the number of edges in the graph. The features are: edge_type: the type of the edge (target-similar (1), target-context(2), similar-similar(3), similar-context(4), context-context(5), self-loop(0)) similarity: the similarity between node embeddings in the current snapshot PMI: the PMI between nodes in the current snapshot

Source code in semantics/graphs/temporal_graph.py

def get_edge_features(self, dataset: List[str], sim_threshold: float = 0.5):
    """
    This method is used to get the edge features of the word graph.

    Every unordered pair of nodes (including each node with itself) is
    visited once; an edge is kept when the embedding similarity of the pair
    is strictly greater than sim_threshold.

    Args:
        dataset (List[str]): the dataset to get the edge features from
        sim_threshold (float): the similarity threshold to create an edge between two nodes. Default: 0.5.

    Returns:
        edge_index (np.ndarray): the edge index of the word graph of shape (2, num_edges) where num_edges is the number of edges in the graph. The first row contains the indices of the first node of the edge and the second row contains the indices of the second node of the edge. An edge is created if the similarity between the two nodes is greater than sim_threshold.
        edge_features (np.ndarray): the edge features of the word graph of shape (num_edges, 3) where num_edges is the number of edges in the graph. The features are:

            - edge_type: the type of the edge (target-similar (1), target-context(2), similar-similar(3), similar-context(4), context-context(5), self-loop(0))

            - similarity: the similarity between node embeddings in the current snapshot

            - PMI: the PMI between nodes in the current snapshot

    Raises:
        ValueError: if a kept edge joins two distinct nodes whose node types
            (column 0 of node_features) have no edge type defined.
    """
    # Edge type per unordered pair of node types. The node type is read from
    # column 0 of node_features (0, 1 and 2 correspond to target, similar and
    # context in the edge-type names above). Keys are sorted so the lookup
    # does not depend on the order of the nodes.
    pair_to_edge_type = {
        (0, 1): 1,
        (0, 2): 2,
        (1, 1): 3,
        (1, 2): 4,
        (2, 2): 5,
    }

    edge_index_1 = []
    edge_index_2 = []
    edge_types = []
    similarities = []
    pmis = []
    num_nodes = self.node_features.shape[0]
    for word_idx1 in range(num_nodes):
        for word_idx2 in range(word_idx1, num_nodes):
            similarity = self.get_similarity(word_idx1, word_idx2)
            # Written as a negated ">" so that a NaN similarity is skipped
            # as well.
            if not (similarity > sim_threshold):
                continue

            if word_idx1 == word_idx2:
                edge_type = 0
            else:
                # float() turns array scalars into plain numbers that hash
                # like the integer keys of the table.
                type_pair = tuple(sorted((
                    float(self.node_features[word_idx1][0]),
                    float(self.node_features[word_idx2][0]),
                )))
                edge_type = pair_to_edge_type.get(type_pair)
                if edge_type is None:
                    raise ValueError(
                        f"No edge type defined for node types {type_pair} "
                        f"(nodes {word_idx1} and {word_idx2})"
                    )

            # The PMI is only needed for edges that are kept, so it is
            # computed after the threshold check.
            pmi = self.get_pmi(dataset, self.index_to_key[word_idx1], self.index_to_key[word_idx2])

            edge_index_1.append(word_idx1)
            edge_index_2.append(word_idx2)
            edge_types.append(edge_type)
            similarities.append(similarity)
            pmis.append(pmi)

    # An explicit integer dtype keeps edge_index integral even when no edge
    # passed the threshold (empty lists would default to float64).
    edge_index = np.array([edge_index_1, edge_index_2], dtype=np.int64)
    edge_features = np.stack([edge_types, similarities, pmis]).T

    return edge_index, edge_features

`get_pmi(data, word1, word2)`

This method is used to get the PMI between two nodes.

Parameters:

Name	Type	Description	Default
`data`	`List[str]`	the dataset in which the occurrences of the words are counted	required
`word1`	`str`	the first node (word)	required
`word2`	`str`	the second node (word)	required

Returns:

Name	Type	Description
`pmi`	`float`	the PMI between the two words in the dataset

Source code in semantics/graphs/temporal_graph.py

def get_pmi(self, data: List[str], word1: str, word2: str) -> float:
    """
    This method is used to get the PMI (pointwise mutual information, base 2)
    between two nodes.

    Args:
        data (List[str]): the dataset in which the occurrences are counted
        word1 (str): the first node (word)
        word2 (str): the second node (word)

    Returns:
        pmi (float): the PMI between the two words in the dataset. 0.0 is
            returned when the PMI is undefined, i.e. when the words never
            co-occur or when any of the counts is zero.
    """
    # count_occurence is defined outside this block. It is called with a
    # single word, with a pair of words and with no word at all; presumably
    # these yield the word count, the co-occurrence count and the total count
    # respectively (confirm against its definition).
    word1_count = count_occurence(data, word1)
    word2_count = count_occurence(data, word2)
    co_occurrence_count = count_occurence(data, [word1, word2])
    total_count = count_occurence(data)

    # Guard every denominator: an empty dataset or an unseen word would
    # otherwise raise ZeroDivisionError below.
    if (
        total_count <= 0
        or word1_count <= 0
        or word2_count <= 0
        or co_occurrence_count <= 0
    ):
        return 0.0

    # Calculate probabilities
    p_word1 = word1_count / total_count
    p_word2 = word2_count / total_count
    p_co_occurrence = co_occurrence_count / total_count

    # Calculate PMI
    return log(p_co_occurrence / (p_word1 * p_word2), 2)

`get_similarity(emb1, emb2)`

This method is used to get the similarity between two nodes.

Parameters:

Name	Type	Description	Default
`emb1`	`int`	the first index of the embedding in node_embeddings	required
`emb2`	`int`	the second index of the embedding in node_embeddings	required

Returns:

Name	Type	Description
`similarity`	`float`	the similarity between the two embeddings

Source code in semantics/graphs/temporal_graph.py

def get_similarity(self, emb1: int, emb2: int) -> float:
    """
    Return the cosine similarity between the embeddings of two nodes.

    Args:
        emb1 (int): index of the first embedding in node_embeddings
        emb2 (int): index of the second embedding in node_embeddings

    Returns:
        similarity (float): the cosine similarity between the two embeddings
    """
    # Each embedding becomes a single-row batch so the comparison runs along
    # dim 1, which is what torch.cosine_similarity uses by default.
    first_row = torch.tensor(self.node_embeddings[emb1]).reshape(1, -1)
    second_row = torch.tensor(self.node_embeddings[emb2]).reshape(1, -1)
    score = torch.cosine_similarity(first_row, second_row)
    return score.item()

Edges

__init__(index_to_key, node_features, node_embeddings)

get_edge_features(dataset, sim_threshold=0.5)

get_pmi(data, word1, word2)

get_similarity(emb1, emb2)

`__init__(index_to_key, node_features, node_embeddings)`

`get_edge_features(dataset, sim_threshold=0.5)`

`get_pmi(data, word1, word2)`

`get_similarity(emb1, emb2)`