27 lines
948 B
Python
27 lines
948 B
Python
# embedder_nomic.py
|
|
from transformers import AutoTokenizer, AutoModel
|
|
import torch
|
|
|
|
class NomicEmbedder:
    """Text embedder built on a Hugging Face tokenizer/model pair.

    Every direct construction reloads the tokenizer and the model from
    ``model_path``. Call :meth:`get_instance` to reuse a single cached
    instance instead of constructing the class repeatedly.
    """

    # Cached shared instance; filled in by the first get_instance() call.
    _instance: "NomicEmbedder | None" = None

    def __init__(self, model_path: str = ".model/nomic-embed-text-v1.5"):
        """Load the tokenizer and model found at *model_path*.

        Args:
            model_path: Location handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        # NOTE(security): trust_remote_code=True allows transformers to run
        # Python code shipped with the checkpoint, so model_path must point
        # at a trusted source.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True
        )
        self.model = AutoModel.from_pretrained(
            model_path, trust_remote_code=True
        )

    @classmethod
    def get_instance(cls) -> "NomicEmbedder":
        """Return the cached instance, creating it on first use.

        The instance is built with the default ``model_path``. No locking is
        performed around the check-and-create step.

        Returns:
            The shared embedder instance.
        """
        if cls._instance is None:
            # Instantiate ``cls`` (not the hard-coded base class) so that a
            # subclass calling get_instance() receives its own type.
            cls._instance = cls()
        return cls._instance

    def embed(self, text: str) -> list[float]:
        """Generate an embedding vector for *text*.

        The text is tokenized as a batch of one (truncated to at most 8192
        tokens), passed through the model with gradients disabled, and the
        model's ``last_hidden_state`` is averaged over dimension 1.

        Args:
            text: The string to embed.

        Returns:
            The pooled embedding as a flat list of floats.
        """
        inputs = self.tokenizer(
            [text],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=8192,
        )
        with torch.no_grad():
            hidden = self.model(**inputs).last_hidden_state
        # Assumes hidden is (batch=1, tokens, features) — averaging over
        # dim 1 then pools across tokens. With a single input there are no
        # padding positions to exclude.
        pooled = hidden.mean(dim=1)
        # Drop only the batch axis. A bare squeeze() would also collapse a
        # size-1 feature axis and make tolist() return a scalar.
        return pooled.squeeze(0).tolist()
|