Skip to content



This class is used to infer the vector embeddings of a word from a sentence.


infer_vector(doc:str, main_word:str)
    This method is used to infer the vector embeddings of a word from a sentence.
_bert_case_preparation()
    This method is used to prepare the BERT model for the inference.
Source code in semantics/feature_extraction/
class BertEmbeddings:
    """
    Infer the contextual vector embedding of a word from a sentence with BERT.

    Methods:
        infer_vector(doc: str, main_word: str)
            Infer the vector embedding of a word from a sentence.
        _bert_case_preparation()
            Load the BERT tokenizer and model used for inference.
    """

    def __init__(
        self,
        pretrained_model_path: Union[str, Path, None] = None,
    ):
        """
        Store the model location; no model is loaded here.

        Args:
            pretrained_model_path: Optional filesystem path handed to
                ``from_pretrained``. When ``None``, ``_bert_case_preparation``
                falls back to ``'bert-base-uncased'``.

        Raises:
            ValueError: If a path is given but does not exist on disk.
        """
        self.model_path = pretrained_model_path
        if pretrained_model_path is not None:
            if not os.path.exists(pretrained_model_path):
                raise ValueError(
                    f'The path {pretrained_model_path} does not exist'
                )
            self.model_path = Path(pretrained_model_path)

        self._tokens = []
        self.model = None
        # Defined up front so the attribute always exists; populated by
        # _bert_case_preparation().
        self.bert_tokenizer = None
        # Flag flipped to True once the tokenizer and model are loaded.
        self.vocab = False
        self.lematizer = None

    def tokens(self):
        """Return the internal token list (``self._tokens``)."""
        return self._tokens

    def _bert_case_preparation(self) -> None:
        """
        Load the BERT tokenizer and model and mark the instance as ready.

        Uses ``self.model_path`` when set, otherwise ``'bert-base-uncased'``.
        """
        model_path = self.model_path if self.model_path is not None else 'bert-base-uncased'
        self.bert_tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertModel.from_pretrained(
            model_path,
            # Hidden states are needed because infer_vector reads outputs[2].
            output_hidden_states=True,
        )
        self.vocab = True

    def infer_vector(self, doc: str, main_word: str):
        """
        Infer the vector embedding of a word from a sentence.

        Args:
            doc: Document to process.
            main_word: Main word to extract the vector embedding for. It is
                lower-cased before being looked up among the tokens.

        Returns:
            torch.Tensor: The entry for ``main_word`` taken from the
            second-to-last element of the model's hidden states.

        Raises:
            ValueError: If the model has not been prepared, or if
                ``main_word`` is not one of the tokens of ``doc``.
        """
        if not self.vocab:
            # self.model is still None here, so report this class's name
            # rather than the (always 'NoneType') model class name.
            raise ValueError(
                f'The Embedding model {self.__class__.__name__} has not been initialized'
            )

        marked_text = "[CLS] " + doc + " [SEP]"
        tokens = self.bert_tokenizer.tokenize(marked_text)

        # Only the lookup is guarded, so a ValueError raised by the tensor
        # construction or the model is not misreported as a missing word.
        try:
            main_token_id = tokens.index(main_word.lower())
        except ValueError as err:
            raise ValueError(
                f'The word: "{main_word}" does not exist in the list of tokens: {tokens} from {doc}'
            ) from err

        idx = self.bert_tokenizer.convert_tokens_to_ids(tokens)
        segment_id = [1] * len(tokens)

        self.tokens_tensor = torch.tensor([idx])
        self.segments_tensors = torch.tensor([segment_id])

        with torch.no_grad():
            # NOTE(review): the second argument is passed positionally; which
            # parameter it binds to depends on the installed BERT
            # implementation's forward() signature - confirm the intent.
            outputs = self.model(self.tokens_tensor, self.segments_tensors)
            hidden_states = outputs[2]

        return hidden_states[-2][0][main_token_id]

infer_vector(doc, main_word)

This method is used to infer the vector embeddings of a word from a sentence.

Args:
    doc: Document to process.
    main_word: Main word to extract the vector embeddings for.

Returns: torch.Tensor

Source code in semantics/feature_extraction/
def infer_vector(self, doc: str, main_word: str):
    """
    Infer the vector embedding of a word from a sentence.

    Args:
        doc: Document to process.
        main_word: Main word to extract the vector embedding for. It is
            lower-cased before being looked up among the tokens.

    Returns:
        torch.Tensor: The entry for ``main_word`` taken from the
        second-to-last element of the model's hidden states.

    Raises:
        ValueError: If the model has not been prepared, or if ``main_word``
            is not one of the tokens of ``doc``.
    """
    if not self.vocab:
        # The original message interpolated the model's class name;
        # report the embedding object's own class name instead.
        raise ValueError(
            f'The Embedding model {self.__class__.__name__} has not been initialized'
        )

    marked_text = "[CLS] " + doc + " [SEP]"
    tokens = self.bert_tokenizer.tokenize(marked_text)

    # Only the lookup is guarded, so a ValueError raised by the tensor
    # construction or the model is not misreported as a missing word.
    try:
        main_token_id = tokens.index(main_word.lower())
    except ValueError as err:
        raise ValueError(
            f'The word: "{main_word}" does not exist in the list of tokens: {tokens} from {doc}'
        ) from err

    idx = self.bert_tokenizer.convert_tokens_to_ids(tokens)
    segment_id = [1] * len(tokens)

    self.tokens_tensor = torch.tensor([idx])
    self.segments_tensors = torch.tensor([segment_id])

    with torch.no_grad():
        # NOTE(review): the second argument is passed positionally; which
        # parameter it binds to depends on the installed BERT
        # implementation's forward() signature - confirm the intent.
        outputs = self.model(self.tokens_tensor, self.segments_tensors)
        hidden_states = outputs[2]

    return hidden_states[-2][0][main_token_id]