Skip to content

Edges

This class is used to get the edges of the word graph.

Methods:

Name Description
__init__

(self, index_to_key: Dict[int, str], node_features: np.ndarray, node_embeddings: np.ndarray): The constructor of the Edges class.

get_similarity

(self, emb1: int, emb2: int) -> float: This method is used to get the similarity between two nodes.

get_pmi

(self, data: List[str], word1: str, word2: str) -> float: This method is used to get the PMI between two nodes.

get_edge_features

(self, dataset: List[str], sim_threshold: float = 0.5): This method is used to get the edge features of the word graph.

Source code in semantics/graphs/temporal_graph.py
class Edges:
    """
    Build the edge index and the edge features of the word graph.

    An edge is kept between two nodes when the cosine similarity of their
    embeddings is greater than a threshold. Every kept edge is described by
    its type, that similarity and the PMI of the two words.

    methods:
        __init__(self, index_to_key: Dict[int, str], node_features: np.ndarray, node_embeddings: np.ndarray)
            The constructor of the Edges class.
        get_similarity(self, emb1: int, emb2: int) -> float
            This method is used to get the similarity between two nodes.
        get_pmi(self, data: List[str], word1: str, word2: str) -> float
            This method is used to get the PMI between two nodes.
        get_edge_features(self, dataset: List[str], sim_threshold: float = 0.5)
            This method is used to get the edge features of the word graph.
    """

    # ((node type, node type), edge type) for every documented pair of
    # distinct nodes. The node type is read from ``node_features[:, 0]``;
    # matching is order-independent, so (1, 0) resolves like (0, 1).
    _EDGE_TYPES = (
        ((0, 1), 1),  # target-similar
        ((0, 2), 2),  # target-context
        ((1, 1), 3),  # similar-similar
        ((1, 2), 4),  # similar-context
        ((2, 2), 5),  # context-context
    )

    def __init__(
            self,
            index_to_key: Dict[int, str],
            node_features: np.ndarray,
            node_embeddings: np.ndarray,
        ):
        """
        Args:
            index_to_key (Dict[int, str]): the index of the nodes of the word graph. The keys are the indices of the nodes and the values are the words of the nodes.
            node_features (np.ndarray): the features of the nodes of the word graph of shape (num_nodes, 3) where num_nodes is the number of nodes in the graph.
            node_embeddings (np.ndarray): the embeddings of the nodes of the word graph from the MLM model, of shape (num_nodes, 768).
        """

        self.index_to_key = index_to_key
        self.node_features = node_features
        self.node_embeddings = node_embeddings

    def get_similarity(self, emb1: int, emb2: int) -> float:
        """
        This method is used to get the cosine similarity between two nodes.

        Args:
            emb1 (int): the first index of the embedding in node_embeddings
            emb2 (int): the second index of the embedding in node_embeddings

        Returns:
            similarity (float): the cosine similarity between the two embeddings
        """
        # torch.cosine_similarity compares along dim 1, so each embedding is
        # reshaped into a single-row matrix first.
        first = torch.tensor(self.node_embeddings[emb1]).reshape(1, -1)
        second = torch.tensor(self.node_embeddings[emb2]).reshape(1, -1)
        return torch.cosine_similarity(first, second).item()

    def get_pmi(self, data: List[str], word1: str, word2: str) -> float:
        """
        This method is used to get the PMI between two nodes:
        ``log2(p(word1, word2) / (p(word1) * p(word2)))``.

        The counts come from ``count_occurence``, which is defined outside
        this class. NOTE(review): it is assumed to return the total count
        when called with ``data`` only, and the co-occurrence count when
        given a list of two words -- confirm against its definition.

        Args:
            data (List[str]): the dataset in which the occurrences are counted
            word1 (str): the first node (word)
            word2 (str): the second node (word)

        Returns:
            pmi (float): the PMI between the two words in the dataset, or 0.0
                when the words never co-occur or when a count needed as a
                denominator is zero (e.g. empty dataset).
        """
        total_count = count_occurence(data)
        if not total_count:
            # Empty dataset: every probability would be a division by zero.
            return 0.0

        co_occurrence_count = count_occurence(data, [word1, word2])
        p_co_occurrence = co_occurrence_count / total_count
        if not p_co_occurrence > 0:
            return 0.0

        word1_count = count_occurence(data, word1)
        word2_count = count_occurence(data, word2)
        if not word1_count or not word2_count:
            # Inconsistent counts; avoid dividing by a zero probability.
            return 0.0

        p_word1 = word1_count / total_count
        p_word2 = word2_count / total_count
        return log(p_co_occurrence / (p_word1 * p_word2), 2)

    def _get_edge_type(self, word_idx1: int, word_idx2: int) -> int:
        """Return the edge type of a node pair, whatever the order of the two nodes."""
        if word_idx1 == word_idx2:
            return 0

        type1 = self.node_features[word_idx1][0]
        type2 = self.node_features[word_idx2][0]
        for (kind_a, kind_b), edge_type in self._EDGE_TYPES:
            if (type1 == kind_a and type2 == kind_b) or (type1 == kind_b and type2 == kind_a):
                return edge_type

        raise ValueError(
            f"No edge type is defined for node types {type1!r} and {type2!r} "
            f"(nodes {word_idx1} and {word_idx2})"
        )

    def get_edge_features(self, dataset: List[str], sim_threshold: float = 0.5):
        """
        This method is used to get the edge features of the word graph.

        Every unordered pair of nodes (self-loops included) is visited once.
        The PMI and the edge type are only computed for the pairs that are
        kept as edges.

        Args:
            dataset (List[str]): the dataset to get the edge features from
            sim_threshold (float): the similarity threshold to create an edge between two nodes. Default: 0.5.

        Returns:
            edge_index (np.ndarray): the edge index of the word graph of shape (2, num_edges) where num_edges is the number of edges in the graph. The first row contains the indices of the first node of the edge and the second row contains the indices of the second node of the edge. An edge is created if the similarity between the two nodes is greater than sim_threshold.
            edge_features (np.ndarray): the edge features of the word graph of shape (num_edges, 3) where num_edges is the number of edges in the graph. The features are:

                - edge_type: the type of the edge (target-similar (1), target-context(2), similar-similar(3), similar-context(4), context-context(5), self-loop(0))

                - similarity: the similarity between node embeddings in the current snapshot

                - PMI: the PMI between nodes in the current snapshot

        Raises:
            ValueError: if a kept edge links two distinct nodes whose types
                match none of the edge types listed above.
        """
        num_nodes = self.node_features.shape[0]
        edge_index_1 = []
        edge_index_2 = []
        edge_types = []
        similarities = []
        pmis = []
        for word_idx1 in range(num_nodes):
            for word_idx2 in range(word_idx1, num_nodes):
                similarity = self.get_similarity(word_idx1, word_idx2)
                # ``not >`` (rather than ``<=``) so a NaN similarity is dropped.
                if not similarity > sim_threshold:
                    continue

                edge_index_1.append(word_idx1)
                edge_index_2.append(word_idx2)
                edge_types.append(self._get_edge_type(word_idx1, word_idx2))
                similarities.append(similarity)
                pmis.append(
                    self.get_pmi(dataset, self.index_to_key[word_idx1], self.index_to_key[word_idx2])
                )

        # Explicit dtypes keep the index integral even when there is no edge.
        edge_index = np.array([edge_index_1, edge_index_2], dtype=np.int64)
        edge_features = np.array([edge_types, similarities, pmis], dtype=float).T

        return edge_index, edge_features

__init__(index_to_key, node_features, node_embeddings)

Parameters:

Name Type Description Default
index_to_key Dict[int, str]

the index of the nodes of the word graph. The keys are the indices of the nodes and the values are the words of the nodes.

required
node_features ndarray

the features of the nodes of the word graph of shape (num_nodes, 3) where num_nodes is the number of nodes in the graph.

required
node_embeddings ndarray

the embeddings of the nodes of the word graph from the MLM model, of shape (num_nodes, 768).

required
Source code in semantics/graphs/temporal_graph.py
def __init__(
    self,
    index_to_key: Dict[int, str],
    node_features: np.ndarray,
    node_embeddings: np.ndarray,
):
    """
    Keep references to the nodes of the word graph.

    Args:
        index_to_key (Dict[int, str]): maps the index of each node of the word graph to the word of that node.
        node_features (np.ndarray): the node features, of shape (num_nodes, 3) where num_nodes is the number of nodes in the graph.
        node_embeddings (np.ndarray): the node embeddings from the MLM model, of shape (num_nodes, 768).
    """
    # The arguments are stored as given; nothing is copied or validated.
    self.node_embeddings = node_embeddings
    self.node_features = node_features
    self.index_to_key = index_to_key

get_edge_features(dataset, sim_threshold=0.5)

This method is used to get the edge features of the word graph.

Parameters:

Name Type Description Default
dataset List[str]

the dataset to get the edge features from

required
sim_threshold float

the similarity threshold to create an edge between two nodes. Default: 0.5.

0.5

Returns:

Name Type Description
edge_index ndarray

the edge index of the word graph of shape (2, num_edges) where num_edges is the number of edges in the graph. The first row contains the indices of the first node of the edge and the second row contains the indices of the second node of the edge. An edge is created if the similarity between the two nodes is greater than sim_threshold.

edge_features ndarray

the edge features of the word graph of shape (num_edges, 3) where num_edges is the number of edges in the graph. The features are:

  • edge_type: the type of the edge (target-similar (1), target-context(2), similar-similar(3), similar-context(4), context-context(5), self-loop(0))

  • similarity: the similarity between node embeddings in the current snapshot

  • PMI: the PMI between nodes in the current snapshot

Source code in semantics/graphs/temporal_graph.py
def get_edge_features(self, dataset: List[str], sim_threshold: float = 0.5):
    """
    This method is used to get the edge features of the word graph.

    Every unordered pair of nodes (self-loops included) is visited once.
    The PMI and the edge type are only computed for the pairs that are kept
    as edges.

    Args:
        dataset (List[str]): the dataset to get the edge features from
        sim_threshold (float): the similarity threshold to create an edge between two nodes. Default: 0.5.

    Returns:
        edge_index (np.ndarray): the edge index of the word graph of shape (2, num_edges) where num_edges is the number of edges in the graph. The first row contains the indices of the first node of the edge and the second row contains the indices of the second node of the edge. An edge is created if the similarity between the two nodes is greater than sim_threshold.
        edge_features (np.ndarray): the edge features of the word graph of shape (num_edges, 3) where num_edges is the number of edges in the graph. The features are:

            - edge_type: the type of the edge (target-similar (1), target-context(2), similar-similar(3), similar-context(4), context-context(5), self-loop(0))

            - similarity: the similarity between node embeddings in the current snapshot

            - PMI: the PMI between nodes in the current snapshot

    Raises:
        ValueError: if a kept edge links two distinct nodes whose types match
            none of the edge types listed above.
    """
    # ((node type, node type), edge type); the node type is read from
    # node_features[:, 0] and the pair is matched in either order, so a
    # (similar, target) pair is labelled like a (target, similar) one.
    pair_types = (
        ((0, 1), 1),
        ((0, 2), 2),
        ((1, 1), 3),
        ((1, 2), 4),
        ((2, 2), 5),
    )

    num_nodes = self.node_features.shape[0]
    edge_index_1 = []
    edge_index_2 = []
    edge_types = []
    similarities = []
    pmis = []
    for word_idx1 in range(num_nodes):
        for word_idx2 in range(word_idx1, num_nodes):
            similarity = self.get_similarity(word_idx1, word_idx2)
            # ``not >`` (rather than ``<=``) so a NaN similarity is dropped.
            if not similarity > sim_threshold:
                continue

            if word_idx1 == word_idx2:
                edge_type = 0
            else:
                type1 = self.node_features[word_idx1][0]
                type2 = self.node_features[word_idx2][0]
                for (kind_a, kind_b), candidate in pair_types:
                    if (type1 == kind_a and type2 == kind_b) or (type1 == kind_b and type2 == kind_a):
                        edge_type = candidate
                        break
                else:
                    # Never reuse the type of a previous pair.
                    raise ValueError(
                        f"No edge type is defined for node types {type1!r} and {type2!r} "
                        f"(nodes {word_idx1} and {word_idx2})"
                    )

            edge_index_1.append(word_idx1)
            edge_index_2.append(word_idx2)
            edge_types.append(edge_type)
            similarities.append(similarity)
            pmis.append(
                self.get_pmi(dataset, self.index_to_key[word_idx1], self.index_to_key[word_idx2])
            )

    # Explicit dtypes keep the index integral even when there is no edge.
    edge_index = np.array([edge_index_1, edge_index_2], dtype=np.int64)
    edge_features = np.array([edge_types, similarities, pmis], dtype=float).T

    return edge_index, edge_features

get_pmi(data, word1, word2)

This method is used to get the PMI between two nodes.

Parameters:

Name Type Description Default
word1 str

the first node (word)

required
word2 str

the second node (word)

required

Returns:

Name Type Description
pmi float

the PMI between the two words in the dataset

Source code in semantics/graphs/temporal_graph.py
def get_pmi(self, data: List[str], word1: str, word2: str) -> float:
    """
    This method is used to get the PMI between two nodes:
    ``log2(p(word1, word2) / (p(word1) * p(word2)))``.

    The counts come from ``count_occurence``, which is defined outside this
    block. NOTE(review): it is assumed to return the total count when called
    with ``data`` only, and the co-occurrence count when given a list of two
    words -- confirm against its definition.

    Args:
        data (List[str]): the dataset in which the occurrences are counted
        word1 (str): the first node (word)
        word2 (str): the second node (word)

    Returns:
        pmi (float): the PMI between the two words in the dataset, or 0.0
            when the words never co-occur or when a count needed as a
            denominator is zero (e.g. empty dataset).
    """
    total_count = count_occurence(data)
    if not total_count:
        # Empty dataset: every probability would be a division by zero.
        return 0.0

    co_occurrence_count = count_occurence(data, [word1, word2])
    p_co_occurrence = co_occurrence_count / total_count
    if not p_co_occurrence > 0:
        return 0.0

    word1_count = count_occurence(data, word1)
    word2_count = count_occurence(data, word2)
    if not word1_count or not word2_count:
        # Inconsistent counts; avoid dividing by a zero probability.
        return 0.0

    p_word1 = word1_count / total_count
    p_word2 = word2_count / total_count
    return log(p_co_occurrence / (p_word1 * p_word2), 2)

get_similarity(emb1, emb2)

This method is used to get the similarity between two nodes.

Parameters:

Name Type Description Default
emb1 int

the first index of the embedding in node_embeddings

required
emb2 int

the second index of the embedding in node_embeddings

required

Returns:

Name Type Description
similarity float

the similarity between the two embeddings

Source code in semantics/graphs/temporal_graph.py
def get_similarity(self, emb1: int, emb2: int) -> float:
    """
    Return the cosine similarity between two rows of ``node_embeddings``.

    Args:
        emb1 (int): index of the first embedding in node_embeddings
        emb2 (int): index of the second embedding in node_embeddings

    Returns:
        similarity (float): the cosine similarity between the two embeddings
    """
    # Each embedding becomes a single-row matrix because
    # torch.cosine_similarity compares along dim 1.
    first, second = (
        torch.tensor(self.node_embeddings[row]).reshape(1, -1)
        for row in (emb1, emb2)
    )
    return torch.cosine_similarity(first, second).item()