
RoBERTa

CustomDataset

Bases: Dataset

This class is used to create a custom dataset for the Roberta model. It inherits from torch.utils.data.Dataset.

Methods

__init__(data: List[str], tokenizer, max_length=128, truncation=True, padding="max_length")
    The constructor for the CustomDataset class.
__len__()
    This method is used to get the length of the dataset.
__getitem__(idx)
    This method is used to get the item at a specific index.
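A minimal usage sketch (the texts and the roberta-base checkpoint are illustrative, not part of this module):

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
texts = ["The brown fox jumps over the lazy dog", "Hello world!"]

dataset = CustomDataset(texts, tokenizer, max_length=128)
print(len(dataset))       # 2 -- one entry per input string
print(dataset[0].keys())  # dict_keys(['input_ids', 'attention_mask'])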
Source code in semantics/feature_extraction/roberta.py
class CustomDataset(Dataset):
    """
    This class is used to create a custom dataset for the Roberta model. It inherits from torch.utils.data.Dataset.

    Methods
    -------
        __init__(data: List[str], tokenizer, max_length=128, truncation=True, padding="max_length")
            The constructor for the CustomDataset class.
        __len__()
            This method is used to get the length of the dataset.
        __getitem__(idx)
            This method is used to get the item at a specific index.  
    """
    def __init__(
            self, 
            data: List[str], 
            tokenizer, 
            max_length=128,
            truncation=True,
            padding="max_length",
            ):
        """
        Args:
            data (List[str]): List of strings to create a dataset from.
            tokenizer: Tokenizer to tokenize the data with.
            max_length (int): Maximum length of the input sequence. Defaults to 128.
            truncation (bool): Whether to truncate the input sequence to max_length or not. Defaults to True.
            padding (str): Padding strategy for the input sequence ("max_length" pads to max_length). Defaults to "max_length".

        Attributes:
            tokenizer: Tokenizer to tokenize the data with.
            max_length (int): Maximum length of the input sequence. Defaults to 128.
            tokenized_data (dict): Dictionary containing the input_ids and attention_mask.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.tokenized_data = tokenizer(data, truncation=truncation, padding=padding, max_length=max_length)

    def __len__(self):
        return len(self.tokenized_data.input_ids)

    def __getitem__(self, idx):
        """
        Retrieves the item at the specified index.

        Parameters:
            idx (int): Index of the item to retrieve.

        Returns:
            tokenized_data (dict): Dictionary containing the input_ids and attention_mask.
        """
        # Get the tokenized inputs at the specified index
        input_ids = self.tokenized_data.input_ids[idx]
        attention_mask = self.tokenized_data.attention_mask[idx]

        # Return a dictionary containing input_ids, attention_mask, and labels (if applicable)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
            # Add 'labels': labels if you have labels for your data
        }

__getitem__(idx)

Retrieves the item at the specified index.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| idx | int | Index of the item to retrieve. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| tokenized_data | dict | Dictionary containing the input_ids and attention_mask. |

Source code in semantics/feature_extraction/roberta.py
def __getitem__(self, idx):
    """
    Retrieves the item at the specified index.

    Parameters:
        idx (int): Index of the item to retrieve.

    Returns:
        tokenized_data (dict): Dictionary containing the input_ids and attention_mask.
    """
    # Get the tokenized inputs at the specified index
    input_ids = self.tokenized_data.input_ids[idx]
    attention_mask = self.tokenized_data.attention_mask[idx]

    # Return a dictionary containing input_ids, attention_mask, and labels (if applicable)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask
        # Add 'labels': labels if you have labels for your data
    }

__init__(data, tokenizer, max_length=128, truncation=True, padding='max_length')

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | List[str] | List of strings to create a dataset from. | required |
| tokenizer | | Tokenizer to tokenize the data with. | required |
| max_length | int | Maximum length of the input sequence. | 128 |
| truncation | bool | Whether to truncate the input sequence to max_length or not. | True |
| padding | str | Padding strategy for the input sequence. | 'max_length' |

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| tokenizer | | Tokenizer to tokenize the data with. |
| max_length | int | Maximum length of the input sequence. |
| tokenized_data | dict | Dictionary containing the input_ids and attention_mask. |

Source code in semantics/feature_extraction/roberta.py
def __init__(
        self, 
        data: List[str], 
        tokenizer, 
        max_length=128,
        truncation=True,
        padding="max_length",
        ):
    """
    Args:
        data (List[str]): List of strings to create a dataset from.
        tokenizer: Tokenizer to tokenize the data with.
        max_length (int): Maximum length of the input sequence. Defaults to 128.
        truncation (bool): Whether to truncate the input sequence to max_length or not. Defaults to True.
        padding (str): Padding strategy for the input sequence ("max_length" pads to max_length). Defaults to "max_length".

    Attributes:
        tokenizer: Tokenizer to tokenize the data with.
        max_length (int): Maximum length of the input sequence. Defaults to 128.
        tokenized_data (dict): Dictionary containing the input_ids and attention_mask.
    """
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.tokenized_data = tokenizer(data, truncation=truncation, padding=padding, max_length=max_length)

RobertaEmbedding

This class is used to infer vector embeddings from a document.

Methods

__init__(pretrained_model_path:Union[str, Path] = None)
    The constructor for the RobertaEmbedding class.
_roberta_case_preparation()
    This method is used to prepare the Roberta model for the inference.
infer_vector(doc:str, main_word:str)
    This method is used to infer the vector embeddings of a word from a document.
infer_mask_logits(doc:str)
    This method is used to infer the logits of the mask token in a document.
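A minimal usage sketch (the document and word are illustrative; loading the default roberta-base checkpoint downloads it on first use):

embedder = RobertaEmbedding()

# RoBERTa's BPE distinguishes word-initial tokens; the leading space in
# " fox" matches the token produced for "fox" occurring after whitespace.
vectors = embedder.infer_vector(doc="The brown fox jumps over the lazy dog", main_word=" fox")
print(vectors.shape)  # torch.Size([1, 768]) for roberta-base

logits = embedder.infer_mask_logits(doc="The brown fox <mask> over the lazy dog")
print(logits.shape)   # torch.Size([1, 50265]) -- one vocabulary-sized row per <mask>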
Source code in semantics/feature_extraction/roberta.py
class RobertaEmbedding:
    """
    This class is used to infer vector embeddings from a document.

    Methods
    -------
        __init__(pretrained_model_path:Union[str, Path] = None)
            The constructor for the RobertaEmbedding class.
        _roberta_case_preparation()
            This method is used to prepare the Roberta model for the inference.
        infer_vector(doc:str, main_word:str)
            This method is used to infer the vector embeddings of a word from a document.
        infer_mask_logits(doc:str)
            This method is used to infer the logits of the mask token in a document.
    """
    def __init__(
        self,
        pretrained_model_path: Optional[Union[str, Path]] = None,
    ):
        """
        Args:
            pretrained_model_path (str, Path, None): Path to the pretrained model. Defaults to None.

        Attributes:
            model_path (str, Path, None): Path to the pretrained model. Defaults to None.
            model (transformers.RobertaModel): RobertaModel object to infer vector embeddings from.
            MLM (transformers.RobertaForMaskedLM): RobertaForMaskedLM object to infer mask logits from.
            tokenizer (transformers.RobertaTokenizer): Tokenizer to tokenize the data with.
            max_length (int): Maximum length of the input sequence. Defaults to 128.
            vocab (bool): Whether the model has been initialized or not.
        """
        self.model_path = pretrained_model_path
        if pretrained_model_path is not None:
            if not os.path.exists(pretrained_model_path):
                raise ValueError(
                    f'The path {pretrained_model_path} does not exist'
                )
            self.model_path = Path(pretrained_model_path)

        self._tokens = []
        self.model = None
        self.vocab = False

        lg.set_verbosity_error()
        self._roberta_case_preparation()

    @property
    def tokens(self):
        return self._tokens

    def _roberta_case_preparation(self) -> None:
        """
        This method is used to prepare the Roberta model for the inference.
        """
        model_path = self.model_path if self.model_path is not None else 'roberta-base'
        self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
        self.model = RobertaModel.from_pretrained(
            model_path, 
            output_hidden_states=True
            )
        self.MLM = RobertaForMaskedLM.from_pretrained(
            model_path
        )
        # self.model_max_length = self.model.config.max_position_embeddings
        # self.mlm_max_length = self.MLM.config.max_position_embeddings
        self.model.eval()
        self.MLM.eval()
        self.vocab = True

    def infer_vector(self, doc:str, main_word:str) -> torch.Tensor:
        """
        This method is used to infer the vector embeddings of a word from a document.

        Args:
            doc (str): Document to process
            main_word (str): Main word to extract the vector embeddings for.

        Returns: 
            embeddings (torch.Tensor): Tensor of stacked embeddings of shape (num_embeddings, embedding_size) where num_embeddings is the number of times the main_word appears in the doc.

        Examples:
            >>> model = RobertaEmbedding()
            >>> model.infer_vector(doc="The brown fox jumps over the lazy dog", main_word="fox")
            tensor([[-0.2182, ..., -0.1709],
                    ...,
                    [-0.2182, ..., -0.1706]])
        """
        if not self.vocab:
            raise ValueError(
                f'The Embedding model {self.model.__class__.__name__} has not been initialized'
            )


        input_ids = self.tokenizer(doc, return_tensors="pt", max_length=512, truncation=True).input_ids
        token = self.tokenizer.encode(main_word, add_special_tokens=False)[0]

        word_token_index = torch.where(input_ids == token)[1]
        emb = []

        try:
            with torch.no_grad():
                embeddings = self.model(input_ids).last_hidden_state

            emb = [embeddings[0, idx] for idx in word_token_index]
            return torch.stack(emb)

        except RuntimeError:
            # torch.stack raises RuntimeError when main_word was not found in the document.
            print(f'The word: "{main_word}" does not exist in the list of tokens')
            return torch.tensor(np.array(emb))




    def infer_mask_logits(self, doc:str) -> torch.Tensor:
        """
        This method is used to infer the logits of the mask token in a document.

        Args:
            doc (str): Document to process where the mask token is present.

        Returns: 
            logits (torch.Tensor): Tensor of stacked logits of shape (num_embeddings, logits_size) where num_embeddings is the number of times the mask token appears in the doc within the max_length.

        Examples:
            >>> model = RobertaEmbedding()
            >>> model.infer_mask_logits(doc="The brown fox <mask> over the lazy dog")
            tensor([[-2.1816e-01,  ..., -1.7064e-01],
                    ...,
                    [-2.1816e-01, ..., -1.7093e-01]])
        """

        if not self.vocab:
            raise ValueError(
                f'The Embedding model {self.MLM.__class__.__name__} has not been initialized'
            )

        input_ids = self.tokenizer(doc, return_tensors="pt", max_length=512, truncation=True).input_ids
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
        l = []
        try:
            with torch.no_grad():
                logits = self.MLM(input_ids).logits

            l = [logits[0, idx] for idx in mask_token_index]
            return torch.stack(l) if len(l) > 0 else torch.empty(0)

        except IndexError:
            raise ValueError(f'The mask falls outside of the max length of {512}, please use a smaller document')

__init__(pretrained_model_path=None)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pretrained_model_path | str, Path, None | Path to the pretrained model. | None |

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| model_path | str, Path, None | Path to the pretrained model. |
| model | RobertaModel | RobertaModel object to infer vector embeddings from. |
| MLM | RobertaForMaskedLM | RobertaForMaskedLM object to infer mask logits from. |
| tokenizer | RobertaTokenizer | Tokenizer to tokenize the data with. |
| max_length | int | Maximum length of the input sequence. Defaults to 128. |
| vocab | bool | Whether the model has been initialized or not. |

Source code in semantics/feature_extraction/roberta.py
def __init__(
    self,
    pretrained_model_path: Optional[Union[str, Path]] = None,
):
    """
    Args:
        pretrained_model_path (str, Path, None): Path to the pretrained model. Defaults to None.

    Attributes:
        model_path (str, Path, None): Path to the pretrained model. Defaults to None.
        model (transformers.RobertaModel): RobertaModel object to infer vector embeddings from.
        MLM (transformers.RobertaForMaskedLM): RobertaForMaskedLM object to infer mask logits from.
        tokenizer (transformers.RobertaTokenizer): Tokenizer to tokenize the data with.
        max_length (int): Maximum length of the input sequence. Defaults to 128.
        vocab (bool): Whether the model has been initialized or not.
    """
    self.model_path = pretrained_model_path
    if pretrained_model_path is not None:
        if not os.path.exists(pretrained_model_path):
            raise ValueError(
                f'The path {pretrained_model_path} does not exist'
            )
        self.model_path = Path(pretrained_model_path)

    self._tokens = []
    self.model = None
    self.vocab = False

    lg.set_verbosity_error()
    self._roberta_case_preparation()

infer_mask_logits(doc)

This method is used to infer the logits of the mask token in a document.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| doc | str | Document to process where the mask token is present. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| logits | Tensor | Tensor of stacked logits of shape (num_embeddings, logits_size), where num_embeddings is the number of times the mask token appears in the doc within the max_length. |

Examples:

>>> model = RobertaEmbedding()
>>> model.infer_mask_logits(doc="The brown fox <mask> over the lazy dog")
tensor([[-2.1816e-01,  ..., -1.7064e-01],
        ...,
        [-2.1816e-01, ..., -1.7093e-01]])
Source code in semantics/feature_extraction/roberta.py
def infer_mask_logits(self, doc:str) -> torch.Tensor:
    """
    This method is used to infer the logits of the mask token in a document.

    Args:
        doc (str): Document to process where the mask token is present.

    Returns: 
        logits (torch.Tensor): Tensor of stacked logits of shape (num_embeddings, logits_size) where num_embeddings is the number of times the mask token appears in the doc within the max_length.

    Examples:
        >>> model = RobertaEmbedding()
        >>> model.infer_mask_logits(doc="The brown fox <mask> over the lazy dog")
        tensor([[-2.1816e-01,  ..., -1.7064e-01],
                ...,
                [-2.1816e-01, ..., -1.7093e-01]])
    """

    if not self.vocab:
        raise ValueError(
            f'The Embedding model {self.MLM.__class__.__name__} has not been initialized'
        )

    input_ids = self.tokenizer(doc, return_tensors="pt", max_length=512, truncation=True).input_ids
    mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
    l = []
    try:
        with torch.no_grad():
            logits = self.MLM(input_ids).logits

        l = [logits[0, idx] for idx in mask_token_index]
        return torch.stack(l) if len(l) > 0 else torch.empty(0)

    except IndexError:
        raise ValueError(f'The mask falls outside of the max length of {512}, please use a smaller document')

infer_vector(doc, main_word)

This method is used to infer the vector embeddings of a word from a document.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| doc | str | Document to process. | required |
| main_word | str | Main word to extract the vector embeddings for. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| embeddings | Tensor | Tensor of stacked embeddings of shape (num_embeddings, embedding_size), where num_embeddings is the number of times the main_word appears in the doc. |

Examples:

>>> model = RobertaEmbedding()
>>> model.infer_vector(doc="The brown fox jumps over the lazy dog", main_word="fox")
tensor([[-0.2182, ..., -0.1709],
        ...,
        [-0.2182, ..., -0.1706]])
Source code in semantics/feature_extraction/roberta.py
def infer_vector(self, doc:str, main_word:str) -> torch.Tensor:
    """
    This method is used to infer the vector embeddings of a word from a document.

    Args:
        doc (str): Document to process
        main_word (str): Main word to extract the vector embeddings for.

    Returns: 
        embeddings (torch.Tensor): Tensor of stacked embeddings of shape (num_embeddings, embedding_size) where num_embeddings is the number of times the main_word appears in the doc.

    Examples:
        >>> model = RobertaEmbedding()
        >>> model.infer_vector(doc="The brown fox jumps over the lazy dog", main_word="fox")
        tensor([[-0.2182, ..., -0.1709],
                ...,
                [-0.2182, ..., -0.1706]])
    """
    if not self.vocab:
        raise ValueError(
            f'The Embedding model {self.model.__class__.__name__} has not been initialized'
        )


    input_ids = self.tokenizer(doc, return_tensors="pt", max_length=512, truncation=True).input_ids
    token = self.tokenizer.encode(main_word, add_special_tokens=False)[0]

    word_token_index = torch.where(input_ids == token)[1]
    emb = []

    try:
        with torch.no_grad():
            embeddings = self.model(input_ids).last_hidden_state

        emb = [embeddings[0, idx] for idx in word_token_index]
        return torch.stack(emb)

    except RuntimeError:
        # torch.stack raises RuntimeError when main_word was not found in the document.
        print(f'The word: "{main_word}" does not exist in the list of tokens')
        return torch.tensor(np.array(emb))

RobertaInference

Wrapper class for the RobertaEmbedding class for inference.

Methods

__init__(pretrained_model_path:Union[str, Path] = None)
    The constructor for the RobertaInference class.
_roberta_case_preparation()
    This method is used to prepare the Roberta model for the inference.
get_embedding(main_word:str, doc: Optional[Union[str, List[str]]] = None, mask:bool=False)
    This method is used to infer the vector embeddings of a word from a document.
get_top_k_words(main_word:str, doc:str, k:int=3)
    This method is used to infer the top k words to replace the masked main_word in a document.
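A minimal usage sketch (exact outputs depend on the checkpoint; the example document mirrors the docstrings below):

inference = RobertaInference()

# Contextual embedding of "office"; the wrapper prepends the leading space
# that RoBERTa's BPE expects for a word occurring after whitespace.
emb = inference.get_embedding(main_word="office", doc="The brown office is very big")
print(emb.shape)  # torch.Size([1, 768])

# Mask "office" and read the top-3 predictions for the masked position.
print(inference.get_top_k_words(main_word="office", doc="The brown office is very big", k=3))
# e.g. ['room', 'eye', 'bear']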
Source code in semantics/feature_extraction/roberta.py
class RobertaInference:
    """
    Wrapper class for the RobertaEmbedding class for inference.

    Methods
    -------
        __init__(pretrained_model_path:Union[str, Path] = None)
            The constructor for the RobertaInference class.
        _roberta_case_preparation()
            This method is used to prepare the Roberta model for the inference.
        get_embedding(main_word:str, doc: Optional[Union[str, List[str]]] = None, mask:bool=False)
            This method is used to infer the vector embeddings of a word from a document.
        get_top_k_words(main_word:str, doc:str, k:int=3)
            This method is used to infer the top k words to replace the masked main_word in a document.
    """

    def __init__(
            self,
            pretrained_model_path: Optional[Union[str, Path]] = None,
    ):
        """
        Args:
            pretrained_model_path (str, Path, None): Path to the pretrained model. Defaults to None.

        Attributes:
            model_path (str, Path, None): Path to the pretrained model. Defaults to None.
            word_vectorizor (RobertaEmbedding): RobertaEmbedding object to infer vector embeddings from.
            vocab (bool): Whether the model has been initialized or not.
        """
        self.model_path = pretrained_model_path
        if pretrained_model_path is not None:
            if not os.path.exists(pretrained_model_path):
                raise ValueError(
                    f'The path {pretrained_model_path} does not exist'
                )
            self.model_path = Path(pretrained_model_path)

        self.word_vectorizor = None
        self.vocab = False


        lg.set_verbosity_error()
        self._roberta_case_preparation()


    def _roberta_case_preparation(self) -> None:
        """
        This method is used to prepare the Roberta model for the inference.
        """
        model_path = self.model_path if self.model_path is not None else 'roberta-base'
        self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
        self.word_vectorizor = RobertaEmbedding(pretrained_model_path=model_path)
        self.vocab = True


    def get_embedding(
            self,
            main_word: str,
            doc: Optional[Union[str, List[str]]] = None,
            mask: bool = False
            ) -> torch.Tensor:

        """
        This method is used to infer the vector embeddings of a word from a document.

        Args:
            main_word (str): Word to get the vector embeddings for
            doc (str, List[str], None): Documents to get the vector embeddings of the main_word from. If None, the document is the main_word itself. Defaults to None.
            mask (bool): Whether to mask the main_word in the documents or not. Defaults to False.

        Returns: 
            embeddings (torch.Tensor): Tensor of stacked embeddings of shape (num_embeddings, embedding_size) where num_embeddings is the number of times the main_word appears in the doc, depending on the mask parameter.

        Examples:
            >>> model = RobertaInference()
            >>> model.get_embedding(main_word="office", doc="The brown office is very big", mask=False)
            tensor([[-0.2182, ..., -0.1709],
                    ...,
                    [-0.2182, ..., -0.1706]])
        """

        if not self.vocab:
            raise ValueError(
                f'The Embedding model {RobertaEmbedding.__name__} has not been initialized'
            )

        if doc is None:
            doc = ' ' + main_word.strip() + ' '

        if mask:
            doc = doc.replace(main_word, self.tokenizer.mask_token)
            main_word = self.tokenizer.mask_token

        else:
            main_word = ' ' + main_word.strip()

        embeddings = self.word_vectorizor.infer_vector(doc=doc, main_word=main_word)
        return embeddings

    def get_top_k_words(
            self,
            main_word: str,
            doc: str,
            k: int = 3
            ) -> List[str]:
        """
        This method is used to infer the top k words to replace the masked main_word in a document.
        Args:
            main_word: Word to mask
            doc: Document to infer the top k words of the main_word from
            k: Number of top words to return

        Returns:
            top_k_words (List[str]): List of top k words

        Examples:
            >>> model = RobertaInference()
            >>> model.get_top_k_words(main_word="office", doc="The brown office is very big")
            ['room', 'eye', 'bear']
        """
        if not self.vocab:
            raise ValueError(
                f'The Embedding model {RobertaEmbedding.__name__} has not been initialized'
            )

        masked_doc = doc.replace(main_word, '<mask>')
        try:
            logits = self.word_vectorizor.infer_mask_logits(doc=masked_doc)
            top_k = []

            for logit_set in logits:
                top_k_tokens = torch.topk(logit_set, k).indices
                top_k_words = [self.tokenizer.decode(token.item()).strip() for token in top_k_tokens]

                top_k.extend(top_k_words)

            return top_k

        except ValueError:
            print(f'The word: "{main_word}" does not exist in the list of tokens')
            return []

__init__(pretrained_model_path=None)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pretrained_model_path | str, Path, None | Path to the pretrained model. | None |

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| model_path | str, Path, None | Path to the pretrained model. |
| word_vectorizor | RobertaEmbedding | RobertaEmbedding object to infer vector embeddings from. |
| vocab | bool | Whether the model has been initialized or not. |

Source code in semantics/feature_extraction/roberta.py
def __init__(
        self,
        pretrained_model_path: Optional[Union[str, Path]] = None,
):
    """
    Args:
        pretrained_model_path (str, Path, None): Path to the pretrained model. Defaults to None.

    Attributes:
        model_path (str, Path, None): Path to the pretrained model. Defaults to None.
        word_vectorizor (RobertaEmbedding): RobertaEmbedding object to infer vector embeddings from.
        vocab (bool): Whether the model has been initialized or not.
    """
    self.model_path = pretrained_model_path
    if pretrained_model_path is not None:
        if not os.path.exists(pretrained_model_path):
            raise ValueError(
                f'The path {pretrained_model_path} does not exist'
            )
        self.model_path = Path(pretrained_model_path)

    self.word_vectorizor = None
    self.vocab = False


    lg.set_verbosity_error()
    self._roberta_case_preparation()

get_embedding(main_word, doc=None, mask=False)

This method is used to infer the vector embeddings of a word from a document.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| main_word | str | Word to get the vector embeddings for. | required |
| doc | str, List[str], None | Documents to get the vector embeddings of the main_word from. If None, the document is the main_word itself. | None |
| mask | bool | Whether to mask the main_word in the documents or not. | False |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| embeddings | Tensor | Tensor of stacked embeddings of shape (num_embeddings, embedding_size), where num_embeddings is the number of times the main_word appears in the doc, depending on the mask parameter. |

Examples:

>>> model = RobertaInference()
>>> model.get_embedding(main_word="office", doc="The brown office is very big", mask=False)
tensor([[-0.2182, ..., -0.1709],
        ...,
        [-0.2182, ..., -0.1706]])
Source code in semantics/feature_extraction/roberta.py
def get_embedding(
        self,
        main_word: str,
        doc: Optional[Union[str, List[str]]] = None,
        mask: bool = False
        ) -> torch.Tensor:

    """
    This method is used to infer the vector embeddings of a word from a document.

    Args:
        main_word (str): Word to get the vector embeddings for
        doc (str, List[str], None): Documents to get the vector embeddings of the main_word from. If None, the document is the main_word itself. Defaults to None.
        mask (bool): Whether to mask the main_word in the documents or not. Defaults to False.

    Returns: 
        embeddings (torch.Tensor): Tensor of stacked embeddings of shape (num_embeddings, embedding_size) where num_embeddings is the number of times the main_word appears in the doc, depending on the mask parameter.

    Examples:
        >>> model = RobertaInference()
        >>> model.get_embedding(main_word="office", doc="The brown office is very big", mask=False)
        tensor([[-0.2182, ..., -0.1709],
                ...,
                [-0.2182, ..., -0.1706]])
    """

    if not self.vocab:
        raise ValueError(
            f'The Embedding model {RobertaEmbedding.__name__} has not been initialized'
        )

    if doc is None:
        doc = ' ' + main_word.strip() + ' '

    if mask:
        doc = doc.replace(main_word, self.tokenizer.mask_token)
        main_word = self.tokenizer.mask_token

    else:
        main_word = ' ' + main_word.strip()

    embeddings = self.word_vectorizor.infer_vector(doc=doc, main_word=main_word)
    return embeddings

get_top_k_words(main_word, doc, k=3)

This method masks the main_word in a document and infers the top k words predicted for the masked position.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| main_word | str | Word to mask. | required |
| doc | str | Document to infer the top k words of the main_word from. | required |
| k | int | Number of top words to return. | 3 |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| top_k_words | List[str] | List of top k words. |

Examples:

>>> model = RobertaInference()
>>> model.get_top_k_words(main_word="office", doc="The brown office is very big")
['room', 'eye', 'bear']
Source code in semantics/feature_extraction/roberta.py
def get_top_k_words(
        self,
        main_word: str,
        doc: str,
        k: int = 3
        ) -> List[str]:
    """
    This method is used to infer the top k words to replace the masked main_word in a document.
    Args:
        main_word: Word to mask
        doc: Document to infer the top k words of the main_word from
        k: Number of top words to return

    Returns:
        top_k_words (List[str]): List of top k words

    Examples:
        >>> model = RobertaInference()
        >>> model.get_top_k_words(main_word="office", doc="The brown office is very big")
        ['room', 'eye', 'bear']
    """
    if not self.vocab:
        raise ValueError(
            f'The Embedding model {RobertaEmbedding.__name__} has not been initialized'
        )

    masked_doc = doc.replace(main_word, '<mask>')
    try:
        logits = self.word_vectorizor.infer_mask_logits(doc=masked_doc)
        top_k = []

        for logit_set in logits:
            top_k_tokens = torch.topk(logit_set, k).indices
            top_k_words = [self.tokenizer.decode(token.item()).strip() for token in top_k_tokens]

            top_k.extend(top_k_words)

        return top_k

    except ValueError:
        print(f'The word: "{main_word}" does not exist in the list of tokens')
        return []

RobertaTrainer

This class is used to train a Roberta model.

Methods

__init__(model_name="roberta-base", max_length=128, mlm_probability=0.15, batch_size=4, learning_rate=1e-5, epochs=3, warmup_steps=500, split_ratio=0.8)
    The constructor for the RobertaTrainer class.
prepare_dataset(data)
    This method is used to prepare the dataset for training.
train(data, output_dir: Union[str, Path] = None)
    This method is used to train the model.
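A minimal end-to-end sketch (the corpus and output directory are placeholders; training downloads roberta-base on first use):

corpus = [
    "The brown fox jumps over the lazy dog",
    "The quick brown fox is back",
    "Hello world!",
]

trainer = RobertaTrainer(batch_size=2, epochs=1, warmup_steps=0)

# Splits the corpus by split_ratio, fine-tunes with masked language modeling,
# prints loss and perplexity per epoch, and saves model + tokenizer to output_dir.
trainer.train(data=corpus, output_dir="output/MLM_roberta")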
Source code in semantics/feature_extraction/roberta.py
class RobertaTrainer:
    """
    This class is used to train a Roberta model.

    Methods
    -------
        __init__(model_name="roberta-base", max_length=128, mlm_probability=0.15, batch_size=4, learning_rate=1e-5, epochs=3, warmup_steps=500, split_ratio=0.8)
            The constructor for the RobertaTrainer class.
        prepare_dataset(data)
            This method is used to prepare the dataset for training.
        train(data, output_dir: Union[str, Path] = None)
            This method is used to train the model.
    """
    def __init__(
            self, 
            model_name: str = "roberta-base", 
            max_length: int = 128, 
            mlm_probability: float = 0.15, 
            batch_size: int = 4, 
            learning_rate: float = 1e-5, 
            epochs: int = 3, 
            warmup_steps: int = 500, 
            split_ratio: float = 0.8, 
            truncation: bool = True, 
            padding: str = "max_length"
            ):

        """
        Args:
            model_name (str): Name of the model to train. Defaults to "roberta-base".
            max_length (int): Maximum length of the input sequence. Defaults to 128.
            mlm_probability (float): Probability of masking tokens in the input sequence. Defaults to 0.15.
            batch_size (int): Size of the batch. Defaults to 4.
            learning_rate (float): Learning rate of the optimizer. Defaults to 1e-5.
            epochs (int): Number of epochs to train the model for. Defaults to 3.
            warmup_steps (int): Number of warmup steps for the learning rate scheduler. Defaults to 500.
            split_ratio (float): Ratio to split the data into train and test. Defaults to 0.8.
            truncation (bool): Whether to truncate the input sequence to max_length or not. Defaults to True.
            padding (str): Padding strategy for the input sequence ("max_length" pads to max_length). Defaults to "max_length".

        Attributes:
            tokenizer (transformers.RobertaTokenizer): Tokenizer to tokenize the data with.
            model (transformers.RobertaForMaskedLM): Model to train.
            data_collator (transformers.DataCollatorForLanguageModeling): DataCollatorForLanguageModeling object to collate the data.
            split_ratio (float): Ratio to split the data into train and test. Defaults to 0.8.
            truncation (bool): Whether to truncate the input sequence to max_length or not. Defaults to True.
            padding (str): Padding strategy for the input sequence ("max_length" pads to max_length). Defaults to "max_length".
            max_length (int): Maximum length of the input sequence. Defaults to 128.
            batch_size (int): Size of the batch. Defaults to 4.
            learning_rate (float): Learning rate of the optimizer. Defaults to 1e-5.
            epochs (int): Number of epochs to train the model for. Defaults to 3.
            warmup_steps (int): Number of warmup steps for the learning rate scheduler. Defaults to 500.
            accelerator (accelerate.Accelerator): Accelerator object to distribute the training across multiple GPUs.
        """


        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.model = RobertaForMaskedLM.from_pretrained(model_name)

        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, 
            mlm=True, 
            mlm_probability=mlm_probability
            )

        self.split_ratio = split_ratio
        self.truncation = truncation
        self.padding = padding
        self.max_length = max_length
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.warmup_steps = warmup_steps
        self.accelerator = Accelerator()

    def prepare_dataset(self, data: List[str]):
        """
        This method is used to prepare the dataset for training.
        Args:
            data: List of strings to train the model on.

        Returns:
            train_loader (torch.utils.data.DataLoader): DataLoader object containing the training data.
            dataset (CustomDataset): CustomDataset object containing the training data.
        """
        dataset = CustomDataset(
            data, 
            self.tokenizer, 
            max_length=self.max_length,
            truncation=self.truncation,
            padding=self.padding
            )

        train_loader = DataLoader(
            dataset, 
            batch_size=self.batch_size, 
            shuffle=True, 
            collate_fn=self.data_collator
            )

        return train_loader, dataset

    def train(
            self, 
            data: List[str],
            output_dir: Optional[Union[str, Path]] = None
            ) -> None:
        """
        This method is used to train the model.

        Args:
            data (List[str]): List of strings to train the model on.
            output_dir (str, Path, None): Path to save the model to. Defaults to None.


        Examples:
            >>> model = RobertaTrainer(epochs=3)
            >>> model.train(data=["The brown fox jumps over the lazy dog", "The brown fox jumps over the lazy dog", "Hello world!"], output_dir="../../output/MLM_roberta")
            Epoch: 0 | Loss: 1.1637206077575684 | Perplexity: 3.2020153999328613
            Epoch: 1 | Loss: 0.6941609382629395 | Perplexity: 2.0011680126190186
            Epoch: 2 | Loss: 0.4749067425727844 | Perplexity: 1.608262062072754  
        """

        train_data, test_data = train_test_split(
            data, 
            test_ratio=1 - self.split_ratio, 
            random_seed=42
            )

        train_loader, _ = self.prepare_dataset(train_data)
        test_loader, _ = self.prepare_dataset(test_data)

        optimizer = optim.AdamW(
            self.model.parameters(), 
            lr=self.learning_rate
            )

        model, optimizer, train_loader, test_loader = self.accelerator.prepare(
            self.model, 
            optimizer, 
            train_loader, 
            test_loader
            )

        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=self.warmup_steps, 
            num_training_steps=len(train_loader) * self.epochs
            )

        progress_bar = tqdm.tqdm(
            range(len(train_loader) * self.epochs), 
            desc="Training", 
            dynamic_ncols=True
            )

        for epoch in range(self.epochs):
            self.model.train()

            for batch in train_loader:
                outputs = self.model(**batch)
                loss = outputs.loss
                self.accelerator.backward(loss)
                optimizer.step()
                scheduler.step()  # Update learning rate scheduler
                optimizer.zero_grad()
                progress_bar.update(1)

            self.model.eval()
            losses = []
            for step, batch in enumerate(test_loader):
                with torch.no_grad():
                    outputs = self.model(**batch)

                loss = outputs.loss
                losses.append(self.accelerator.gather(loss.repeat(self.batch_size)))

            losses = torch.cat(losses)
            losses = losses[:len(test_data)]

            try:
                perplexity = math.exp(torch.mean(losses))
            except OverflowError:
                perplexity = float("inf")
            print(f"Epoch: {epoch} | Loss: {torch.mean(losses)} | Perplexity: {perplexity}")

            # Save model
            if output_dir is not None:
                self.accelerator.wait_for_everyone()
                unwrapped_model = self.accelerator.unwrap_model(model)
                unwrapped_model.save_pretrained(output_dir, save_function=self.accelerator.save)
                if self.accelerator.is_main_process:
                    self.tokenizer.save_pretrained(output_dir)

__init__(model_name='roberta-base', max_length=128, mlm_probability=0.15, batch_size=4, learning_rate=1e-05, epochs=3, warmup_steps=500, split_ratio=0.8, truncation=True, padding='max_length')

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model_name | str | Name of the model to train. | 'roberta-base' |
| max_length | int | Maximum length of the input sequence. | 128 |
| mlm_probability | float | Probability of masking tokens in the input sequence. | 0.15 |
| batch_size | int | Size of the batch. | 4 |
| learning_rate | float | Learning rate of the optimizer. | 1e-05 |
| epochs | int | Number of epochs to train the model for. | 3 |
| warmup_steps | int | Number of warmup steps for the learning rate scheduler. | 500 |
| split_ratio | float | Ratio to split the data into train and test. | 0.8 |
| truncation | bool | Whether to truncate the input sequence to max_length or not. | True |
| padding | str | Padding strategy for the input sequence. | 'max_length' |

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| tokenizer | RobertaTokenizer | Tokenizer to tokenize the data with. |
| model | RobertaForMaskedLM | Model to train. |
| data_collator | DataCollatorForLanguageModeling | DataCollatorForLanguageModeling object to collate the data. |
| split_ratio | float | Ratio to split the data into train and test. |
| truncation | bool | Whether to truncate the input sequence to max_length or not. |
| padding | str | Padding strategy for the input sequence. |
| max_length | int | Maximum length of the input sequence. |
| batch_size | int | Size of the batch. |
| learning_rate | float | Learning rate of the optimizer. |
| epochs | int | Number of epochs to train the model for. |
| warmup_steps | int | Number of warmup steps for the learning rate scheduler. |
| accelerator | Accelerator | Accelerator object to distribute the training across multiple GPUs. |

Source code in semantics/feature_extraction/roberta.py
def __init__(
        self, 
        model_name: str = "roberta-base", 
        max_length: int = 128, 
        mlm_probability: float = 0.15, 
        batch_size: int = 4, 
        learning_rate: float = 1e-5, 
        epochs: int = 3, 
        warmup_steps: int = 500, 
        split_ratio: float = 0.8, 
        truncation: bool = True, 
        padding: str = "max_length"
        ):

    """
    Args:
        model_name (str): Name of the model to train. Defaults to "roberta-base".
        max_length (int): Maximum length of the input sequence. Defaults to 128.
        mlm_probability (float): Probability of masking tokens in the input sequence. Defaults to 0.15.
        batch_size (int): Size of the batch. Defaults to 4.
        learning_rate (float): Learning rate of the optimizer. Defaults to 1e-5.
        epochs (int): Number of epochs to train the model for. Defaults to 3.
        warmup_steps (int): Number of warmup steps for the learning rate scheduler. Defaults to 500.
        split_ratio (float): Ratio to split the data into train and test. Defaults to 0.8.
        truncation (bool): Whether to truncate the input sequence to max_length or not. Defaults to True.
        padding (str): Padding strategy for the input sequence ("max_length" pads to max_length). Defaults to "max_length".

    Attributes:
        tokenizer (transformers.RobertaTokenizer): Tokenizer to tokenize the data with.
        model (transformers.RobertaForMaskedLM): Model to train.
        data_collator (transformers.DataCollatorForLanguageModeling): DataCollatorForLanguageModeling object to collate the data.
        split_ratio (float): Ratio to split the data into train and test. Defaults to 0.8.
        truncation (bool): Whether to truncate the input sequence to max_length or not. Defaults to True.
        padding (str): Padding strategy for the input sequence ("max_length" pads to max_length). Defaults to "max_length".
        max_length (int): Maximum length of the input sequence. Defaults to 128.
        batch_size (int): Size of the batch. Defaults to 4.
        learning_rate (float): Learning rate of the optimizer. Defaults to 1e-5.
        epochs (int): Number of epochs to train the model for. Defaults to 3.
        warmup_steps (int): Number of warmup steps for the learning rate scheduler. Defaults to 500.
        accelerator (accelerate.Accelerator): Accelerator object to distribute the training across multiple GPUs.
    """


    self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
    self.model = RobertaForMaskedLM.from_pretrained(model_name)

    self.data_collator = DataCollatorForLanguageModeling(
        tokenizer=self.tokenizer, 
        mlm=True, 
        mlm_probability=mlm_probability
        )

    self.split_ratio = split_ratio
    self.truncation = truncation
    self.padding = padding
    self.max_length = max_length
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epochs = epochs
    self.warmup_steps = warmup_steps
    self.accelerator = Accelerator()

prepare_dataset(data)

This method is used to prepare the dataset for training.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | List[str] | List of strings to train the model on. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| train_loader | DataLoader | DataLoader object containing the training data. |
| dataset | CustomDataset | CustomDataset object containing the training data. |

Source code in semantics/feature_extraction/roberta.py
def prepare_dataset(self, data: List[str]):
    """
    This method is used to prepare the dataset for training.
    Args:
        data: List of strings to train the model on.

    Returns:
        train_loader (torch.utils.data.DataLoader): DataLoader object containing the training data.
        dataset (CustomDataset): CustomDataset object containing the training data.
    """
    dataset = CustomDataset(
        data, 
        self.tokenizer, 
        max_length=self.max_length,
        truncation=self.truncation,
        padding=self.padding
        )

    train_loader = DataLoader(
        dataset, 
        batch_size=self.batch_size, 
        shuffle=True, 
        collate_fn=self.data_collator
        )

    return train_loader, dataset

train(data, output_dir=None)

This method is used to train the model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | List[str] | List of strings to train the model on. | required |
| output_dir | str, Path, None | Path to save the model to. | None |

Examples:

>>> model = RobertaTrainer(epochs=3)
>>> model.train(data=["The brown fox jumps over the lazy dog", "The brown fox jumps over the lazy dog", "Hello world!"], output_dir="../../output/MLM_roberta")
Epoch: 0 | Loss: 1.1637206077575684 | Perplexity: 3.2020153999328613
Epoch: 1 | Loss: 0.6941609382629395 | Perplexity: 2.0011680126190186
Epoch: 2 | Loss: 0.4749067425727844 | Perplexity: 1.608262062072754
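The reported perplexity is the exponential of the mean test loss, perplexity = exp(mean loss); for example, exp(1.1637) ≈ 3.2020, matching the epoch-0 line above. An OverflowError in exp yields inf.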
Source code in semantics/feature_extraction/roberta.py
def train(
        self, 
        data: List[str],
        output_dir: Optional[Union[str, Path]] = None
        ) -> None:
    """
    This method is used to train the model.

    Args:
        data (List[str]): List of strings to train the model on.
        output_dir (str, Path, None): Path to save the model to. Defaults to None.


    Examples:
        >>> model = RobertaTrainer(epochs=3)
        >>> model.train(data=["The brown fox jumps over the lazy dog", "The brown fox jumps over the lazy dog", "Hello world!"], output_dir="../../output/MLM_roberta")
        Epoch: 0 | Loss: 1.1637206077575684 | Perplexity: 3.2020153999328613
        Epoch: 1 | Loss: 0.6941609382629395 | Perplexity: 2.0011680126190186
        Epoch: 2 | Loss: 0.4749067425727844 | Perplexity: 1.608262062072754  
    """

    train_data, test_data = train_test_split(
        data, 
        test_ratio=1 - self.split_ratio, 
        random_seed=42
        )

    train_loader, _ = self.prepare_dataset(train_data)
    test_loader, _ = self.prepare_dataset(test_data)

    optimizer = optim.AdamW(
        self.model.parameters(), 
        lr=self.learning_rate
        )

    model, optimizer, train_loader, test_loader = self.accelerator.prepare(
        self.model, 
        optimizer, 
        train_loader, 
        test_loader
        )

    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=self.warmup_steps, 
        num_training_steps=len(train_loader) * self.epochs
        )

    progress_bar = tqdm.tqdm(
        range(len(train_loader) * self.epochs), 
        desc="Training", 
        dynamic_ncols=True
        )

    for epoch in range(self.epochs):
        self.model.train()

        for batch in train_loader:
            outputs = self.model(**batch)
            loss = outputs.loss
            self.accelerator.backward(loss)
            optimizer.step()
            scheduler.step()  # Update learning rate scheduler
            optimizer.zero_grad()
            progress_bar.update(1)

        self.model.eval()
        losses = []
        for step, batch in enumerate(test_loader):
            with torch.no_grad():
                outputs = self.model(**batch)

            loss = outputs.loss
            losses.append(self.accelerator.gather(loss.repeat(self.batch_size)))

        losses = torch.cat(losses)
        losses = losses[:len(test_data)]

        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            perplexity = float("inf")
        print(f"Epoch: {epoch} | Loss: {torch.mean(losses)} | Perplexity: {perplexity}")

        # Save model
        if output_dir is not None:
            self.accelerator.wait_for_everyone()
            unwrapped_model = self.accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=self.accelerator.save)
            if self.accelerator.is_main_process:
                self.tokenizer.save_pretrained(output_dir)

References

  1. torch.utils.data