Word2Vec

`Word2VecAlign`

Wrapper class for gensim.models.Word2Vec to align Word2Vec models.

Methods

__init__(model_paths)
    Initialize the Word2VecAlign object with a list of paths to the Word2Vec models.
load_models()
    Load the models
align_models(reference_index, output_dir, method)
    Align the models

Source code in semantics/feature_extraction/word2vec.py

class Word2VecAlign:
    """
    Wrapper class for gensim.models.Word2Vec to align Word2Vec models.

    Methods
    -------
        __init__(model_paths)
            Initialize the Word2VecAlign object with a list of paths to the Word2Vec models.
        load_models()
            Load every model listed in ``model_paths`` and append it to ``models``.
        align_models(reference_index, output_dir, method)
            Align every loaded model to a chosen reference model.
    """
    def __init__(
            self, 
            model_paths: List[str],

            ):
        """
        Store the model paths, derive one name per path and load the models.

        Args:
            model_paths (List[str]): List of paths to the models 

        Attributes:
            model_paths (List[str]): List of paths to the models 
            reference_model (gensim.models.Word2Vec): The reference model (None until align_models is called)
            models (List[gensim.models.Word2Vec]): List of loaded models, in the order of model_paths
            model_names (List[str]): File stem of each path, in the order of model_paths
            aligned_models (List[gensim.models.Word2Vec]): List of aligned models (empty until align_models is called)
        """
        self.model_paths = model_paths
        self.reference_model = None
        self.models = []
        self.model_names = [Path(model_path).stem for model_path in model_paths]
        self.aligned_models = []

        self.load_models()

    def load_models(self) -> None:
        """
        Load each path in ``model_paths`` with ``Word2Vec.load`` and append the
        result to ``models``.
        """
        for model_path in self.model_paths:
            self.models.append(Word2Vec.load(model_path))

    def align_models(
            self,
            reference_index: int = -1,
            output_dir: Optional[str] = None,
            method: str = "procrustes",
            ) -> List[Word2Vec]:
        """
        Align every loaded model to the reference model.

        The reference model is placed first in the result, followed by the
        remaining models in their original order after each one has been passed
        through ``smart_procrustes_align_gensim``. ``models`` and ``model_names``
        are left untouched, and ``aligned_models`` is rebuilt on every call, so
        the method can be called repeatedly.

        Args: 
            reference_index (int, optional): Index of the reference model, by default -1
            output_dir (str, optional): Directory to save the aligned models in as
                ``<model_name>_aligned.model``. The directory is created when missing.
                Nothing is written when None, by default None
            method (str, optional): Alignment method, by default "procrustes"

        Returns:
            aligned_models (List[gensim.models.Word2Vec]): List of aligned models

        Raises:
            NotImplementedError: If ``method`` is anything other than "procrustes".
            IndexError: If ``reference_index`` does not refer to a loaded model.

        Examples:
            >>> from semantics.feature_extraction.word2vec import Word2VecAlign
            >>> model_paths = ['model1.model', 'model2.model']
            >>> aligner = Word2VecAlign(model_paths)
            >>> aligned_models = aligner.align_models(reference_index=0, output_dir='aligned_models')
            >>> print('Aligned models: ', aligned_models)
            Aligned models:  [Word2Vec(vocab=5, vector_size=100, alpha=0.025), Word2Vec(vocab=5, vector_size=100, alpha=0.025)]
        """

        if method != "procrustes":
            raise NotImplementedError("Only procrustes alignment is implemented. Please use method='procrustes'")

        # Indexing a range normalises negative indices and raises IndexError for
        # out-of-range values, so names and models can share one position.
        reference_position = range(len(self.models))[reference_index]

        if output_dir is not None:
            Path(output_dir).mkdir(parents=True, exist_ok=True)

        self.reference_model = self.models[reference_position]
        # The reference is saved before any alignment runs, as before.
        if output_dir is not None:
            self.reference_model.save(f"{output_dir}/{self.model_names[reference_position]}_aligned.model")
        self.aligned_models = [self.reference_model]

        # Pair every model with its own name so the saved file name always
        # matches the model, wherever the reference sits in the list.
        for position, (model_name, model) in enumerate(zip(self.model_names, self.models)):
            if position == reference_position:
                continue
            aligned_model = smart_procrustes_align_gensim(self.reference_model, model)
            if output_dir is not None:
                aligned_model.save(f"{output_dir}/{model_name}_aligned.model")
            self.aligned_models.append(aligned_model)

        return self.aligned_models

`__init__(model_paths)`

Parameters:

Name	Type	Description	Default
`model_paths`	`List[str]`	List of paths to the models	required

Attributes:

Name	Type	Description
`model_paths`	`List[str]`	List of paths to the models
`reference_model`	`Word2Vec`	The reference model
`models`	`List[Word2Vec]`	List of models
`model_names`	`List[str]`	List of model names
`aligned_models`	`List[Word2Vec]`	List of aligned models

Source code in semantics/feature_extraction/word2vec.py

def __init__(self, model_paths: List[str]):
    """
    Record the model paths, derive a name for each one and load the models.

    Args:
        model_paths (List[str]): List of paths to the models

    Attributes:
        model_paths (List[str]): List of paths to the models
        reference_model (gensim.models.Word2Vec): The reference model, None at construction
        models (List[gensim.models.Word2Vec]): List of models, filled by load_models()
        model_names (List[str]): File stem of every entry in model_paths
        aligned_models (List[gensim.models.Word2Vec]): List of aligned models, empty at construction
    """
    self.model_paths = model_paths
    self.reference_model = None
    self.models = []
    # One name per path: the file name without its directory and suffix.
    self.model_names = [Path(path).stem for path in model_paths]
    self.aligned_models = []

    # Loading happens eagerly, so a bad path fails at construction time.
    self.load_models()

`align_models(reference_index=-1, output_dir=None, method='procrustes')`

Align the models

Parameters:

Name	Type	Description	Default
`reference_index`	`int`	Index of the reference model, by default -1	`-1`
`output_dir`	`str`	Path to save the aligned models, by default None	`None`
`method`	`str`	Alignment method, by default "procrustes"	`'procrustes'`

Returns:

Name	Type	Description
`aligned_models`	`List[Word2Vec]`	List of aligned models

Examples:

>>> from semantics.feature_extraction.word2vec import Word2VecAlign
>>> model_paths = ['model1.model', 'model2.model']
>>> aligner = Word2VecAlign(model_paths)
>>> aligned_models = aligner.align_models(reference_index=0, output_dir='aligned_models')
>>> print('Aligned models: ', aligned_models)
Aligned models:  [Word2Vec(vocab=5, vector_size=100, alpha=0.025), Word2Vec(vocab=5, vector_size=100, alpha=0.025)]

Source code in semantics/feature_extraction/word2vec.py

def align_models(
        self,
        reference_index: int = -1,
        output_dir: Optional[str] = None,
        method: str = "procrustes",
        ) -> List[Word2Vec]:
    """
    Align every loaded model to the reference model.

    The reference model is placed first in the result, followed by the
    remaining models in their original order after each one has been passed
    through ``smart_procrustes_align_gensim``. ``models`` and ``model_names``
    are left untouched, and ``aligned_models`` is rebuilt on every call, so
    the method can be called repeatedly.

    Args: 
        reference_index (int, optional): Index of the reference model, by default -1
        output_dir (str, optional): Directory to save the aligned models in as
            ``<model_name>_aligned.model``. The directory is created when missing.
            Nothing is written when None, by default None
        method (str, optional): Alignment method, by default "procrustes"

    Returns:
        aligned_models (List[gensim.models.Word2Vec]): List of aligned models

    Raises:
        NotImplementedError: If ``method`` is anything other than "procrustes".
        IndexError: If ``reference_index`` does not refer to a loaded model.

    Examples:
        >>> from semantics.feature_extraction.word2vec import Word2VecAlign
        >>> model_paths = ['model1.model', 'model2.model']
        >>> aligner = Word2VecAlign(model_paths)
        >>> aligned_models = aligner.align_models(reference_index=0, output_dir='aligned_models')
        >>> print('Aligned models: ', aligned_models)
        Aligned models:  [Word2Vec(vocab=5, vector_size=100, alpha=0.025), Word2Vec(vocab=5, vector_size=100, alpha=0.025)]
    """

    if method != "procrustes":
        raise NotImplementedError("Only procrustes alignment is implemented. Please use method='procrustes'")

    # Indexing a range normalises negative indices and raises IndexError for
    # out-of-range values, so names and models can share one position.
    reference_position = range(len(self.models))[reference_index]

    if output_dir is not None:
        Path(output_dir).mkdir(parents=True, exist_ok=True)

    self.reference_model = self.models[reference_position]
    # The reference is saved before any alignment runs, as before.
    if output_dir is not None:
        self.reference_model.save(f"{output_dir}/{self.model_names[reference_position]}_aligned.model")
    self.aligned_models = [self.reference_model]

    # Pair every model with its own name so the saved file name always
    # matches the model, wherever the reference sits in the list.
    for position, (model_name, model) in enumerate(zip(self.model_names, self.models)):
        if position == reference_position:
            continue
        aligned_model = smart_procrustes_align_gensim(self.reference_model, model)
        if output_dir is not None:
            aligned_model.save(f"{output_dir}/{model_name}_aligned.model")
        self.aligned_models.append(aligned_model)

    return self.aligned_models

`load_models()`

Load the models

Source code in semantics/feature_extraction/word2vec.py

def load_models(self) -> None:
    """
    Load every path in ``model_paths`` with ``Word2Vec.load`` and append the
    resulting models, in order, to ``models``.
    """
    # extend() consumes the generator lazily, so models are appended one by
    # one exactly as an explicit loop would do.
    self.models.extend(Word2Vec.load(path) for path in self.model_paths)

`Word2VecEmbeddings`

Wrapper class for gensim.models.Word2Vec to infer word vectors.

Methods

__init__(pretrained_model_path)
    Initialize the Word2VecEmbeddings object with a pretrained model.
_word2vec_case_preparation()
    Prepare the Word2Vec model
infer_vector(word, norm)
    Infer the vector of a word

Source code in semantics/feature_extraction/word2vec.py

class Word2VecEmbeddings:
    """
    Thin wrapper around gensim.models.Word2Vec used to look up word vectors.

    Methods
    -------
        __init__(pretrained_model_path)
            Initialize the Word2VecEmbeddings object with a pretrained model.
        _word2vec_case_preparation()
            Create or load the underlying Word2Vec model
        infer_vector(word, norm)
            Look up the vector of a word
    """
    def __init__(
            self,
            pretrained_model_path: Optional[str] = None,
            ):
        """
        Validate the optional model path and prepare the underlying model.

        Args:
            pretrained_model_path (str, optional): Path to a pretrained model, by default None

        Attributes:
            model_path (str, optional): Path to a pretrained model, by default None
            model (gensim.models.Word2Vec): The Word2Vec model
            vocab (bool): Whether the model has been initialized

        Raises:
            ValueError: If a path is given but does not exist on disk.
        """
        path_is_missing = (
            pretrained_model_path is not None
            and not os.path.exists(pretrained_model_path)
        )
        if path_is_missing:
            raise ValueError(
                f"Model path {pretrained_model_path} does not exist."
            )

        self.model_path = pretrained_model_path
        self.model = None
        self.vocab = False

        self._word2vec_case_preparation()

    def _word2vec_case_preparation(self) -> None:
        """
        Create an empty Word2Vec model when no path was given, otherwise load
        the model stored at ``model_path``; then mark the wrapper as initialized.
        """
        self.model = (
            Word2Vec()
            if self.model_path is None
            else Word2Vec.load(self.model_path)
        )
        self.vocab = True

    def infer_vector(self, word: str, norm=False) -> List[float]:
        """
        Look up the embedding vector of a word.

        Args:
            word (str): The word to infer the embedding vector of
            norm (bool, optional): Whether to normalize the vector, by default False

        Returns:
            embedding (List[float]): The embedding vector of the word, as returned
                by the model's ``wv.get_vector``

        Raises:
            ValueError: If the model has not been initialized.
        """
        if self.vocab:
            return self.model.wv.get_vector(word, norm=norm)

        raise ValueError(
            f'The Embedding model {self.model.__class__.__name__} has not been initialized'
        )

`__init__(pretrained_model_path=None)`

Parameters:

Name	Type	Description	Default
`pretrained_model_path`	`str`	Path to a pretrained model, by default None	`None`

Attributes:

Name	Type	Description
`model_path`	`str`	Path to a pretrained model, by default None
`model`	`Word2Vec`	The Word2Vec model
`vocab`	`bool`	Whether the model has been initialized

Source code in semantics/feature_extraction/word2vec.py

def __init__(self, pretrained_model_path: Optional[str] = None):
    """
    Validate the optional model path and prepare the underlying model.

    Args:
        pretrained_model_path (str, optional): Path to a pretrained model, by default None

    Attributes:
        model_path (str, optional): Path to a pretrained model, by default None
        model (gensim.models.Word2Vec): The Word2Vec model
        vocab (bool): Whether the model has been initialized

    Raises:
        ValueError: If a path is given but does not exist on disk.
    """
    path_is_missing = (
        pretrained_model_path is not None
        and not os.path.exists(pretrained_model_path)
    )
    if path_is_missing:
        raise ValueError(
            f"Model path {pretrained_model_path} does not exist."
        )

    self.model_path = pretrained_model_path
    self.model = None
    self.vocab = False

    # Creates or loads the model and flips ``vocab`` to True.
    self._word2vec_case_preparation()

`infer_vector(word, norm=False)`

Infer the vector of a word

Parameters:

Name	Type	Description	Default
`word`	`str`	The word to infer the embedding vector of	required
`norm`	`bool`	Whether to normalize the vector, by default False	`False`

Returns:

Name	Type	Description
`embedding`	`List[float]`	The embedding vector of the word

Source code in semantics/feature_extraction/word2vec.py

def infer_vector(self, word: str, norm=False) -> List[float]:
    """
    Look up the embedding vector of a word.

    Args:
        word (str): The word to infer the embedding vector of
        norm (bool, optional): Whether to normalize the vector, by default False

    Returns:
        embedding (List[float]): The embedding vector of the word, as returned
            by the model's ``wv.get_vector``

    Raises:
        ValueError: If the model has not been initialized.
    """
    if self.vocab:
        return self.model.wv.get_vector(word, norm=norm)

    raise ValueError(
        f'The Embedding model {self.model.__class__.__name__} has not been initialized'
    )

`Word2VecInference`

Wrapper class for gensim.models.Word2Vec for Inference.

Methods

__init__(pretrained_model_path)
    Initialize the Word2VecInference object with a pretrained model.
get_embedding(word, norm)
    Infer the vector of a word
get_similarity(word1, word2)
    Get the cosine similarity between two words
get_top_k_words(word, k)
    Get the top k most similar words to a word in the vocabulary of the model.

Source code in semantics/feature_extraction/word2vec.py

class Word2VecInference:
    """
    Wrapper class for gensim.models.Word2Vec for Inference.

    Methods
    -------
        __init__(pretrained_model_path)
            Initialize the Word2VecInference object with a pretrained model.
        get_embedding(word, norm)
            Infer the vector of a word
        get_similarity(word1, word2)
            Get the cosine similarity between two words
        get_top_k_words(word, k)
            Get the top k most similar words to a word in the vocabulary of the model.
    """
    def __init__(
            self,
            pretrained_model_path: Optional[str] = None,
            ):
        """
        Build the Word2VecEmbeddings helper that all lookups go through.

        Args:
            pretrained_model_path (str, optional): Path to a pretrained model, by default None  

        Attributes:
            word_vectorizor (Word2VecEmbeddings): The Word2VecEmbeddings object
        """
        self.word_vectorizor = Word2VecEmbeddings(pretrained_model_path)

    def get_embedding(self, word:str, norm: bool = False) -> List[float]:
        """
        Infer the vector of a word

        Args:
            word (str): The word to infer the embedding vector of
            norm (bool, optional): Whether to normalize the vector, by default False

        Returns:
            embedding (List[float]): The embedding vector of the word

        Examples:
            >>> from semantics.feature_extraction.word2vec import Word2VecInference
            >>> Word2VecInference('model.model').get_embedding('test', norm=False)
            array([-0.00460768, -0.00460768, ..., -0.00460768, -0.00460768])
        """
        return self.word_vectorizor.infer_vector(word= word, norm = norm)

    def get_similarity(self, word1: str, word2: str) -> float:
        """
        Get the cosine similarity between two words' embedding vectors

        Args:
            word1 (str): The first word
            word2 (str): The second word

        Returns:
            similarity (float): The cosine similarity between the two words

        Examples:
            >>> from semantics.feature_extraction.word2vec import Word2VecInference
            >>> Word2VecInference('model.model').get_similarity('test', 'another')
            0.99999994
        """
        return self.word_vectorizor.model.wv.similarity(word1, word2)

    def get_top_k_words(
            self,
            word: str,
            k: int = 10,
            ):
        """
        Get the top k most similar words to a word in the vocabulary of the model. Default k = 10

        The result is always a pair ``(words, similarities)``. Both lists are
        empty when the word is not in the vocabulary (a message is printed in
        that case) or when the model returns no neighbours, e.g. for ``k=0``.

        Args:
            word (str): The word to get the top k most similar words of
            k (int, optional): The number of words to return, by default 10

        Returns:
            topk (Tuple[List[str], List[float]]): Tuple of lists of the top k most similar words and their cosine similarities

        Examples:
            >>> from semantics.feature_extraction.word2vec import Word2VecInference
            >>> Word2VecInference('model.model').get_top_k_words('test', k=1)
            (['another'], [0.9999999403953552])
        """

        try:
            sims = self.word_vectorizor.model.wv.most_similar(
                word,
                topn=k
                )
        except KeyError:
            print(f"The word {word} in the input is not in the model vocabulary.")
            return [], []

        # zip(*[]) yields nothing, which would collapse the result to an empty
        # tuple and break callers that unpack two lists.
        if len(sims) == 0:
            return [], []

        words, similarities = zip(*sims)
        return list(words), list(similarities)

`__init__(pretrained_model_path=None)`

Parameters:

Name	Type	Description	Default
`pretrained_model_path`	`str`	Path to a pretrained model, by default None	`None`

Attributes:

Name	Type	Description
`word_vectorizor`	`Word2VecEmbeddings`	The Word2VecEmbeddings object

Source code in semantics/feature_extraction/word2vec.py

def __init__(self, pretrained_model_path: Optional[str] = None):
    """
    Build the Word2VecEmbeddings helper that all lookups go through.

    Args:
        pretrained_model_path (str, optional): Path to a pretrained model, by default None

    Attributes:
        word_vectorizor (Word2VecEmbeddings): The Word2VecEmbeddings object
    """
    embeddings = Word2VecEmbeddings(pretrained_model_path=pretrained_model_path)
    self.word_vectorizor = embeddings

`get_embedding(word, norm=False)`

Infer the vector of a word

Parameters:

Name	Type	Description	Default
`word`	`str`	The word to infer the embedding vector of	required
`norm`	`bool`	Whether to normalize the vector, by default False	`False`

Returns:

Name	Type	Description
`embedding`	`List[float]`	The embedding vector of the word

Examples:

>>> from semantics.feature_extraction.word2vec import Word2VecInference
>>> Word2VecInference('model.model').get_embedding('test', norm=False)
array([-0.00460768, -0.00460768, ..., -0.00460768, -0.00460768])

Source code in semantics/feature_extraction/word2vec.py

def get_embedding(self, word: str, norm: bool = False) -> List[float]:
    """
    Return the embedding vector of a word by delegating to the wrapped
    Word2VecEmbeddings object.

    Args:
        word (str): The word to infer the embedding vector of
        norm (bool, optional): Whether to normalize the vector, by default False

    Returns:
        embedding (List[float]): The embedding vector of the word

    Examples:
        >>> from semantics.feature_extraction.word2vec import Word2VecInference
        >>> Word2VecInference('model.model').get_embedding('test', norm=False)
        array([-0.00460768, -0.00460768, ..., -0.00460768, -0.00460768])
    """
    vectorizer = self.word_vectorizor
    return vectorizer.infer_vector(word=word, norm=norm)

`get_similarity(word1, word2)`

Get the cosine similarity between two words' embedding vectors

Parameters:

Name	Type	Description	Default
`word1`	`str`	The first word	required
`word2`	`str`	The second word	required

Returns:

Name	Type	Description
`similarity`	`float`	The cosine similarity between the two words

Examples:

>>> from semantics.feature_extraction.word2vec import Word2VecInference
>>> Word2VecInference('model.model').get_similarity('test', 'another')
0.99999994

Source code in semantics/feature_extraction/word2vec.py

def get_similarity(self, word1: str, word2: str) -> float:
    """
    Return the similarity the model's ``wv.similarity`` reports for two words
    (the cosine similarity between their embedding vectors).

    Args:
        word1 (str): The first word
        word2 (str): The second word

    Returns:
        similarity (float): The cosine similarity between the two words

    Examples:
        >>> from semantics.feature_extraction.word2vec import Word2VecInference
        >>> Word2VecInference('model.model').get_similarity('test', 'another')
        0.99999994
    """
    keyed_vectors = self.word_vectorizor.model.wv
    return keyed_vectors.similarity(word1, word2)

`get_top_k_words(word, k=10)`

Get the top k most similar words to a word in the vocabulary of the model. Default k = 10

Parameters:

Name	Type	Description	Default
`word`	`str`	The word to get the top k most similar words of	required
`k`	`int`	The number of words to return, by default 10	`10`

Returns:

Name	Type	Description
`topk`	`Tuple[List[str], List[float]]`	Tuple of lists of the top k most similar words and their cosine similarities

Examples:

>>> from semantics.feature_extraction.word2vec import Word2VecInference
>>> Word2VecInference('model.model').get_top_k_words('test', k=1)
(['another'], [0.9999999403953552])

Source code in semantics/feature_extraction/word2vec.py

def get_top_k_words(
        self,
        word: str,
        k: int = 10,
        ):
    """
    Get the top k most similar words to a word in the vocabulary of the model. Default k = 10

    The result is always a pair ``(words, similarities)``. Both lists are
    empty when the word is not in the vocabulary (a message is printed in
    that case) or when the model returns no neighbours, e.g. for ``k=0``.

    Args:
        word (str): The word to get the top k most similar words of
        k (int, optional): The number of words to return, by default 10

    Returns:
        topk (Tuple[List[str], List[float]]): Tuple of lists of the top k most similar words and their cosine similarities

    Examples:
        >>> from semantics.feature_extraction.word2vec import Word2VecInference
        >>> Word2VecInference('model.model').get_top_k_words('test', k=1)
        (['another'], [0.9999999403953552])
    """

    try:
        sims = self.word_vectorizor.model.wv.most_similar(
            word,
            topn=k
            )
    except KeyError:
        print(f"The word {word} in the input is not in the model vocabulary.")
        return [], []

    # zip(*[]) yields nothing, which would collapse the result to an empty
    # tuple and break callers that unpack two lists.
    if len(sims) == 0:
        return [], []

    words, similarities = zip(*sims)
    return list(words), list(similarities)

`Word2VecTrainer`

Wrapper class for gensim.models.Word2Vec to train a Word2Vec model.

Methods

__init__(model_path, min_count, window, negative, ns_exponent, vector_size, workers, sg, **kwargs)
    Initialize the Word2Vec model
train(data, output_dir, epochs, start_alpha, end_alpha, compute_loss, **kwargs)
    Train the Word2Vec model on the given data

Source code in semantics/feature_extraction/word2vec.py

class Word2VecTrainer:
    """
    Wrapper class for gensim.models.Word2Vec to train a Word2Vec model.

    Methods
    -------
        __init__(model_path, min_count, window, negative, ns_exponent, vector_size, workers, sg, **kwargs)
            Initialize the Word2Vec model
        train(data, output_dir, epochs, start_alpha, end_alpha, compute_loss, **kwargs)
            Train the Word2Vec model on the given data
    """

    def __init__(
            self,
            model_path: Optional[str] = None,
            min_count=0,
            window=15,
            negative=5,
            ns_exponent=0.75,
            vector_size=100,
            workers=1,
            sg=1,
            **kwargs
            ):
        """
        Load a saved model when ``model_path`` is given, otherwise create a new
        model from the hyperparameters. The hyperparameters are ignored when a
        model is loaded.

        Args:
            model_path (str, optional): Path to a pretrained model, by default None.
            min_count (int, optional): Ignores all words with total frequency lower than this, by default 0
            window (int, optional): The maximum distance between the current and predicted word within a sentence, by default 15
            negative (int, optional): If > 0, negative sampling will be used, by default 5
            ns_exponent (float, optional): The exponent used to shape the negative sampling distribution, by default 0.75
            vector_size (int, optional): Dimensionality of the word vectors, by default 100
            workers (int, optional): Number of worker threads to train the model, by default 1
            sg (int, optional): Training algorithm: 1 for skip-gram; otherwise CBOW, by default 1
            **kwargs (optional): Additional arguments to pass to the gensim.models.Word2Vec constructor

        Attributes:
            model (gensim.models.Word2Vec): The Word2Vec model
        """

        if model_path:
            self.model = Word2Vec.load(model_path)
        else:
            self.model = Word2Vec(
                    min_count=min_count,
                    window=window,
                    negative=negative,
                    ns_exponent=ns_exponent,
                    vector_size=vector_size,
                    workers=workers,
                    sg=sg,
                    **kwargs
                    )

    def train(
            self, 
            data: List[str],
            output_dir: Optional[Union[str, Path]] = None,
            epochs=5,
            start_alpha=0.025,
            end_alpha=0.0001,
            compute_loss=True,
            **kwargs
            ):
        """
        Train the Word2Vec model on the given data

        The vocabulary is built from ``data`` first. When the model already has
        a vocabulary (a model loaded from ``model_path``, or one trained
        before), it is extended with ``update=True`` rather than rebuilt, so
        existing vectors are kept and training continues from them.

        Args:
            data (List[str]): List of documents
            output_dir (Union[str, Path], None): Path to save the trained model, by default None
            epochs (int, optional): Number of epochs, by default 5
            start_alpha (float, optional): Learning rate, by default 0.025
            end_alpha (float, optional): Minimum learning rate, by default 0.0001
            compute_loss (bool, optional): Whether to compute the loss, by default True
            **kwargs (optional): Additional arguments forwarded to the model's ``train`` call

        Examples:
            >>> from semantics.feature_extraction.word2vec import Word2VecTrainer
            >>> texts = ['This is a test.', 'This is another test.', 'This is a third test.']
            >>> trainer = Word2VecTrainer()
            >>> trainer.train(texts, epochs=1)
            >>> print('Trained model: ', trainer.model)
            Trained model:  Word2Vec(vocab=5, vector_size=100, alpha=0.025)
        """
        # Rebuilding the vocabulary from scratch would re-initialise the
        # weights of an already trained model, so extend it instead.
        has_vocabulary = len(self.model.wv) > 0
        self.model.build_vocab(data, update=has_vocabulary)

        total_examples = self.model.corpus_count
        self.model.train(
                data,
                total_examples=total_examples,
                epochs=epochs,
                start_alpha=start_alpha,
                end_alpha=end_alpha,
                compute_loss=compute_loss,
                **kwargs
                )
        if output_dir:
            # Hand the model a plain string path even when a Path is passed.
            if isinstance(output_dir, Path):
                output_dir = str(output_dir)
            self.model.save(output_dir)

`__init__(model_path=None, min_count=0, window=15, negative=5, ns_exponent=0.75, vector_size=100, workers=1, sg=1, **kwargs)`

Parameters:

Name	Type	Description	Default
`model_path`	`str`	Path to a pretrained model, by default None.	`None`
`min_count`	`int`	Ignores all words with total frequency lower than this, by default 0	`0`
`window`	`int`	The maximum distance between the current and predicted word within a sentence, by default 15	`15`
`negative`	`int`	If > 0, negative sampling will be used, by default 5	`5`
`ns_exponent`	`float`	The exponent used to shape the negative sampling distribution, by default 0.75	`0.75`
`vector_size`	`int`	Dimensionality of the word vectors, by default 100	`100`
`workers`	`int`	Number of worker threads to train the model, by default 1	`1`
`sg`	`int`	Training algorithm: 1 for skip-gram; otherwise CBOW, by default 1	`1`
`**kwargs`	`optional`	Additional arguments to pass to the gensim.models.Word2Vec constructor	`{}`

Attributes:

Name	Type	Description
`model`	`Word2Vec`	The Word2Vec model

Source code in semantics/feature_extraction/word2vec.py

def __init__(
        self,
        model_path: Optional[str] = None,
        min_count=0,
        window=15,
        negative=5,
        ns_exponent=0.75,
        vector_size=100,
        workers=1,
        sg=1,
        **kwargs
        ):
    """
    Load a saved model when ``model_path`` is given, otherwise create a new
    model from the hyperparameters. The hyperparameters are ignored when a
    model is loaded.

    Args:
        model_path (str, optional): Path to a pretrained model, by default None.
        min_count (int, optional): Ignores all words with total frequency lower than this, by default 0
        window (int, optional): The maximum distance between the current and predicted word within a sentence, by default 15
        negative (int, optional): If > 0, negative sampling will be used, by default 5
        ns_exponent (float, optional): The exponent used to shape the negative sampling distribution, by default 0.75
        vector_size (int, optional): Dimensionality of the word vectors, by default 100
        workers (int, optional): Number of worker threads to train the model, by default 1
        sg (int, optional): Training algorithm: 1 for skip-gram; otherwise CBOW, by default 1
        **kwargs (optional): Additional arguments to pass to the gensim.models.Word2Vec constructor

    Attributes:
        model (gensim.models.Word2Vec): The Word2Vec model
    """
    # A truthy path means "resume from disk"; nothing else is needed then.
    if model_path:
        self.model = Word2Vec.load(model_path)
        return

    hyperparameters = dict(
        min_count=min_count,
        window=window,
        negative=negative,
        ns_exponent=ns_exponent,
        vector_size=vector_size,
        workers=workers,
        sg=sg,
    )
    self.model = Word2Vec(**hyperparameters, **kwargs)

`train(data, output_dir=None, epochs=5, start_alpha=0.025, end_alpha=0.0001, compute_loss=True, **kwargs)`

Train the Word2Vec model on the given data

Parameters:

Name	Type	Description	Default
`data`	`List[str]`	List of documents	required
`output_dir`	`(Union[str, Path], None)`	Path to save the trained model, by default None	`None`
`epochs`	`int`	Number of epochs, by default 5	`5`
`start_alpha`	`float`	Learning rate, by default 0.025	`0.025`
`end_alpha`	`float`	Minimum learning rate, by default 0.0001	`0.0001`
`compute_loss`	`bool`	Whether to compute the loss, by default True	`True`
`**kwargs`	`optional`	Additional keyword arguments forwarded to the underlying model's `train` call	`{}`

Examples:

>>> from semantics.feature_extraction.word2vec import Word2VecTrainer
>>> texts = ['This is a test.', 'This is another test.', 'This is a third test.']
>>> trainer = Word2VecTrainer()
>>> trainer.train(texts, epochs=1)
>>> print('Trained model: ', trainer.model)
Trained model:  Word2Vec(vocab=5, vector_size=100, alpha=0.025)

Source code in semantics/feature_extraction/word2vec.py

def train(
        self, 
        data: List[str],
        output_dir: Optional[Union[str, Path]] = None,
        epochs=5,
        start_alpha=0.025,
        end_alpha=0.0001,
        compute_loss=True,
        **kwargs
        ):
    """
    Train the Word2Vec model on the given data

    The vocabulary is built from ``data`` first. When the model already has
    a vocabulary (a model loaded from a saved file, or one trained before),
    it is extended with ``update=True`` rather than rebuilt, so existing
    vectors are kept and training continues from them.

    Args:
        data (List[str]): List of documents
        output_dir (Union[str, Path], None): Path to save the trained model, by default None
        epochs (int, optional): Number of epochs, by default 5
        start_alpha (float, optional): Learning rate, by default 0.025
        end_alpha (float, optional): Minimum learning rate, by default 0.0001
        compute_loss (bool, optional): Whether to compute the loss, by default True
        **kwargs (optional): Additional arguments forwarded to the model's ``train`` call

    Examples:
        >>> from semantics.feature_extraction.word2vec import Word2VecTrainer
        >>> texts = ['This is a test.', 'This is another test.', 'This is a third test.']
        >>> trainer = Word2VecTrainer()
        >>> trainer.train(texts, epochs=1)
        >>> print('Trained model: ', trainer.model)
        Trained model:  Word2Vec(vocab=5, vector_size=100, alpha=0.025)
    """
    # Rebuilding the vocabulary from scratch would re-initialise the
    # weights of an already trained model, so extend it instead.
    has_vocabulary = len(self.model.wv) > 0
    self.model.build_vocab(data, update=has_vocabulary)

    total_examples = self.model.corpus_count
    self.model.train(
            data,
            total_examples=total_examples,
            epochs=epochs,
            start_alpha=start_alpha,
            end_alpha=end_alpha,
            compute_loss=compute_loss,
            **kwargs
            )
    if output_dir:
        # Hand the model a plain string path even when a Path is passed.
        if isinstance(output_dir, Path):
            output_dir = str(output_dir)
        self.model.save(output_dir)

Word2Vec

Word2VecAlign

Methods

__init__(model_paths)

align_models(reference_index=-1, output_dir=None, method='procrustes')

load_models()

Word2VecEmbeddings

Methods

__init__(pretrained_model_path=None)

infer_vector(word, norm=False)

Word2VecInference

Methods

__init__(pretrained_model_path=None)

get_embedding(word, norm=False)

get_similarity(word1, word2)

get_top_k_words(word, k=10)

Word2VecTrainer

Methods

__init__(model_path=None, min_count=0, window=15, negative=5, ns_exponent=0.75, vector_size=100, workers=1, sg=1, **kwargs)

train(data, output_dir=None, epochs=5, start_alpha=0.025, end_alpha=0.0001, compute_loss=True, **kwargs)

`Word2VecAlign`

`__init__(model_paths)`

`align_models(reference_index=-1, output_dir=None, method='procrustes')`

`load_models()`

`Word2VecEmbeddings`

`__init__(pretrained_model_path=None)`

`infer_vector(word, norm=False)`

`Word2VecInference`

`__init__(pretrained_model_path=None)`

`get_embedding(word, norm=False)`

`get_similarity(word1, word2)`

`get_top_k_words(word, k=10)`

`Word2VecTrainer`

`__init__(model_path=None, min_count=0, window=15, negative=5, ns_exponent=0.75, vector_size=100, workers=1, sg=1, **kwargs)`

`train(data, output_dir=None, epochs=5, start_alpha=0.025, end_alpha=0.0001, compute_loss=True, **kwargs)`