Do you guys have any issues with semantic chunking with AzureOpenAI?
I’m currently facing issues — if anyone knows a fix,
please help me out.
Do you guys have any issues with semantic chunking with AzureOpenAI?
I’m currently facing issues — if anyone knows a fix,
please help me out.
Hey @msmaths99, thank you for reaching out. Could you please share the issue you are facing? Then we will be able to assist you better.
I’m using AzureOpenAI for embeddings
and trying to use semantic chunking, but agno is calling its default OpenAI embedder for semantic chunking — it isn’t using the AzureOpenAI embedder even though we are passing AzureOpenAI.
"""Knowledge base manager for handling multiple document knowledge bases."""
import os
import logging
from dotenv import load_dotenv
from typing import Dict, Optional, Any
from agno.embedder.azure_openai import AzureOpenAIEmbedder
from agno.vectordb.pgvector import PgVector, SearchType
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader
from agno.knowledge.docx import DocxKnowledgeBase, DocxReader
from agno.knowledge.text import TextKnowledgeBase, TextReader
from agno.document.chunking.semantic import SemanticChunking

# Load environment variables from a local .env file before reading config.
load_dotenv()

# Azure OpenAI embedding configuration (all three are required at runtime;
# validated lazily in KnowledgeBaseManager.embedder).
AZURE_EMBEDDER_OPENAI_API_KEY = os.getenv("AZURE_EMBEDDER_OPENAI_API_KEY")
AZURE_EMBEDDER_OPENAI_ENDPOINT = os.getenv("AZURE_EMBEDDER_OPENAI_ENDPOINT")
AZURE_EMBEDDER_DEPLOYMENT = os.getenv("AZURE_EMBEDDER_DEPLOYMENT")
DB_URL = os.getenv("POSTGRES_URL")

logging.basicConfig(level=logging.INFO)
# BUG FIX: was `logging.getLogger(name)` — `name` is undefined at module
# level; the module-name dunder is `__name__`.
logger = logging.getLogger(__name__)
class KnowledgeBaseManager:
    """Singleton class to manage knowledge bases for uploaded documents.

    Caches one knowledge base per PgVector table and lazily builds a single
    shared AzureOpenAIEmbedder so every consumer (the vector DB *and* the
    semantic chunker) uses the same Azure deployment.
    """

    _instance = None
    # Cache of table_name -> knowledge base instance (shared process-wide).
    _knowledge_bases: Dict[str, Any] = {}
    _embedder = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(KnowledgeBaseManager, cls).__new__(cls)
        return cls._instance

    @property
    def embedder(self):
        """Lazily initialize the Azure OpenAI embedder.

        Raises:
            ValueError: if any required Azure environment variable is unset.
        """
        if self._embedder is None:
            # Validate required environment variables up front so a missing
            # key fails loudly instead of surfacing as an opaque API error.
            if not all([AZURE_EMBEDDER_OPENAI_API_KEY, AZURE_EMBEDDER_OPENAI_ENDPOINT, AZURE_EMBEDDER_DEPLOYMENT]):
                raise ValueError(
                    "Missing required Azure embedder environment variables: "
                    "AZURE_EMBEDDER_OPENAI_API_KEY, AZURE_EMBEDDER_OPENAI_ENDPOINT, AZURE_EMBEDDER_DEPLOYMENT"
                )
            self._embedder = AzureOpenAIEmbedder(
                api_key=AZURE_EMBEDDER_OPENAI_API_KEY,
                azure_endpoint=AZURE_EMBEDDER_OPENAI_ENDPOINT,
                azure_deployment=AZURE_EMBEDDER_DEPLOYMENT,
                # Optional: specify model if needed
                # model="text-embedding-3-small"
            )
            logger.info(f"Initialized Azure OpenAI embedder: {type(self._embedder)}")
        return self._embedder

    def get_vector_db(self, table_name: str):
        """Get or create vector database for a table with the Azure embedder."""
        try:
            vector_db = PgVector(
                table_name=table_name,
                db_url=DB_URL,
                search_type=SearchType.hybrid,
                embedder=self.embedder,  # This ensures Azure embedder is used
            )
            logger.info(f"Created PgVector with Azure embedder for table: {table_name}")
            return vector_db
        except Exception as e:
            logger.error(f"Failed to create vector database: {e}")
            raise

    def create_knowledge_base(self, document_path: str, table_name: str, file_type: str):
        """Create and load knowledge base for a document.

        Raises:
            FileNotFoundError: if document_path does not exist.
            ValueError: if file_type is not one of pdf/doc/docx/txt.
        """
        try:
            # Validate file exists
            if not os.path.exists(document_path):
                raise FileNotFoundError(f"Document not found: {document_path}")
            vector_db = self.get_vector_db(table_name)
            # BUG FIX: SemanticChunking() with no arguments falls back to the
            # default OpenAI embedder; the Azure embedder must be passed
            # explicitly so chunking uses the same Azure deployment.
            if file_type.lower() == 'pdf':
                logger.info(f"Creating PDF knowledge base for: {document_path}")
                kb = PDFKnowledgeBase(
                    path=document_path,
                    vector_db=vector_db,
                    reader=PDFReader(chunk=False),
                    chunking_strategy=SemanticChunking(embedder=self.embedder)
                )
            elif file_type.lower() in ['doc', 'docx']:
                logger.info(f"Creating DOCX knowledge base for: {document_path}")
                kb = DocxKnowledgeBase(
                    path=document_path,
                    vector_db=vector_db,
                    reader=DocxReader(chunk=False),
                    chunking_strategy=SemanticChunking(embedder=self.embedder)
                )
            elif file_type.lower() == 'txt':
                logger.info(f"Creating Text knowledge base for: {document_path}")
                kb = TextKnowledgeBase(
                    path=document_path,
                    vector_db=vector_db,
                    reader=TextReader(chunk=False),
                    chunking_strategy=SemanticChunking(embedder=self.embedder)
                )
            else:
                raise ValueError(f"Unsupported file type: {file_type}")
            # Load the document into the knowledge base
            logger.info(f"Loading knowledge base for table: {table_name}")
            kb.load(recreate=True)
            # Store in cache
            self._knowledge_bases[table_name] = kb
            logger.info(f"Knowledge base created and loaded for table: {table_name}")
            return kb
        except Exception as e:
            logger.error(f"Failed to create knowledge base: {e}")
            raise

    def get_knowledge_base(self, table_name: str, document_type: str = "pdf") -> Optional[Any]:
        """Get existing knowledge base or create new one.

        Returns None (after logging) when the existing vector DB cannot be
        reached — deliberate best-effort behaviour for callers.
        """
        if table_name in self._knowledge_bases:
            logger.info(f"Retrieved cached knowledge base: {table_name}")
            return self._knowledge_bases[table_name]
        # Try to connect to existing vector DB
        try:
            vector_db = self.get_vector_db(table_name)
            if document_type.lower() == 'pdf':
                kb = PDFKnowledgeBase(vector_db=vector_db)
            elif document_type.lower() in ['doc', 'docx']:
                kb = DocxKnowledgeBase(vector_db=vector_db)
            elif document_type.lower() == 'txt':
                kb = TextKnowledgeBase(vector_db=vector_db)
            else:
                raise ValueError(f"Unsupported document type: {document_type}")
            self._knowledge_bases[table_name] = kb
            logger.info(f"Connected to existing knowledge base: {table_name}")
            return kb
        except Exception as e:
            logger.error(f"Could not connect to existing knowledge base: {e}")
            return None

    def list_knowledge_bases(self) -> Dict[str, Any]:
        """List all cached knowledge bases."""
        return {
            "cached_knowledge_bases": list(self._knowledge_bases.keys()),
            "total_count": len(self._knowledge_bases),
            "embedder_type": type(self.embedder).__name__
        }

    def remove_knowledge_base(self, table_name: str) -> bool:
        """Remove a knowledge base from cache; True if it was present."""
        if table_name in self._knowledge_bases:
            del self._knowledge_bases[table_name]
            logger.info(f"Removed knowledge base from cache: {table_name}")
            return True
        return False

    def validate_configuration(self) -> Dict[str, Any]:
        """Validate the current configuration without triggering lazy init."""
        return {
            "embedder_configured": self._embedder is not None,
            "embedder_type": type(self.embedder).__name__ if self._embedder else None,
            "database_url_set": bool(DB_URL),
            "azure_credentials_set": bool(AZURE_EMBEDDER_OPENAI_API_KEY and AZURE_EMBEDDER_OPENAI_ENDPOINT and AZURE_EMBEDDER_DEPLOYMENT)
        }
Hi @msmaths99 You need to provide the embedder in your SemanticChunking() here to ensure it uses the correct model. Otherwise it will fallback to OpenAI
STILL FACING THE SAME ERROR AND NOT RESOLVED WITH YOUR UPDATE
"""Knowledge base manager for handling multiple document knowledge bases."""
import os
from dotenv import load_dotenv
from typing import Dict, Optional, Any
from agno.embedder.azure_openai import AzureOpenAIEmbedder
from agno.vectordb.pgvector import PgVector, SearchType
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader
from agno.knowledge.docx import DocxKnowledgeBase, DocxReader
from agno.knowledge.text import TextKnowledgeBase, TextReader
from agno.document.chunking.semantic import SemanticChunking

# Load environment variables
load_dotenv()

# NOTE(review): "AGNO_DEFAULT_EMBEDDER" does not look like a documented agno
# setting — TODO confirm against the agno docs. The reliable mechanism is
# passing the embedder explicitly to PgVector and SemanticChunking.
os.environ["AGNO_DEFAULT_EMBEDDER"] = "azure_openai"

# Configuration (read once at import time; used lazily by the manager).
AZURE_EMBEDDED_API_KEY = os.getenv("AZURE_EMBEDDED_API_KEY")
AZURE_EMBEDDED_ENDPOINT = os.getenv("AZURE_EMBEDDED_ENDPOINT")
AZURE_EMBEDDED_API_VERSION = os.getenv("AZURE_EMBEDDED_API_VERSION")
AZURE_EMBEDDED_DEPLOYMENT = os.getenv("AZURE_EMBEDDED_DEPLOYMENT")
DB_URL = os.getenv("POSTGRES_DB_URL")
class KnowledgeBaseManager:
    """Singleton class to manage knowledge bases for uploaded documents."""

    _instance = None
    # Cache of table_name -> knowledge base instance, shared by all callers.
    _knowledge_bases: Dict[str, Any] = {}
    _embedder = None

    def __new__(cls):
        # BUG FIX: was `_new_` (single underscores), which never overrides
        # object.__new__, so the singleton behaviour silently did not apply.
        if cls._instance is None:
            cls._instance = super(KnowledgeBaseManager, cls).__new__(cls)
        return cls._instance

    # BUG FIX: decorator was `@propertyproperty` — a NameError at class
    # definition time.
    @property
    def embedder(self):
        """Lazy initialization of the Azure OpenAI embedder."""
        if self._embedder is None:
            self._embedder = AzureOpenAIEmbedder(
                api_key=AZURE_EMBEDDED_API_KEY,
                azure_endpoint=AZURE_EMBEDDED_ENDPOINT,
                api_version=AZURE_EMBEDDED_API_VERSION,
                azure_deployment=AZURE_EMBEDDED_DEPLOYMENT,
            )
        return self._embedder

    def get_vector_db(self, table_name: str):
        """Get or create vector database for a table.

        Raises:
            ValueError: if table_name is empty.
        """
        if not table_name:
            raise ValueError("Table name must be provided for vector DB")
        return PgVector(
            table_name=table_name,
            db_url=DB_URL,
            search_type=SearchType.hybrid,
            # Explicit embedder so PgVector does not fall back to OpenAI.
            embedder=self.embedder,
        )

    def create_knowledge_base(self, document_path: str, table_name: str, file_type: str):
        """Create and load a knowledge base for a document.

        Returns:
            dict with keys "kb" and "table_name".
        Raises:
            ValueError: if table_name is empty or file_type unsupported.
        """
        if not table_name:
            raise ValueError("Table name must be provided when creating knowledge base")
        vector_db = self.get_vector_db(table_name)
        if file_type.lower() == "pdf":
            print("==================file type=============", "pdf")
            kb = PDFKnowledgeBase(
                path=document_path,
                vector_db=vector_db,
                reader=PDFReader(chunk=False),
                # Embedder passed explicitly so semantic chunking uses Azure.
                chunking_strategy=SemanticChunking(embedder=self.embedder),
            )
        elif file_type.lower() in ["doc", "docx"]:
            kb = DocxKnowledgeBase(
                path=document_path,
                vector_db=vector_db,
                reader=DocxReader(chunk=False),
                chunking_strategy=SemanticChunking(embedder=self.embedder),
            )
        elif file_type.lower() == "txt":
            kb = TextKnowledgeBase(
                path=document_path,
                vector_db=vector_db,
                reader=TextReader(chunk=False),
                chunking_strategy=SemanticChunking(embedder=self.embedder),
            )
        else:
            raise ValueError(f"Unsupported file type: {file_type}")
        # Load the document into the knowledge base
        kb.load(recreate=True)
        # Store in cache
        self._knowledge_bases[table_name] = kb
        print(f"Knowledge base created and loaded for table: {table_name}")
        # Always return both KB and table_name so downstream code never loses it
        return {"kb": kb, "table_name": table_name}

    def get_knowledge_base(self, table_name: str, document_type: str = "pdf") -> Optional[Any]:
        """Get a cached knowledge base or connect to an existing one.

        Returns None (best effort) when connection fails.
        Raises:
            ValueError: if table_name is empty.
        """
        if not table_name:
            raise ValueError("Table name must be provided when retrieving knowledge base")
        if table_name in self._knowledge_bases:
            return self._knowledge_bases[table_name]
        try:
            vector_db = self.get_vector_db(table_name)
            if document_type.lower() == "pdf":
                kb = PDFKnowledgeBase(
                    vector_db=vector_db,
                    chunking_strategy=SemanticChunking(embedder=self.embedder),
                )
            elif document_type.lower() in ["doc", "docx"]:
                kb = DocxKnowledgeBase(
                    vector_db=vector_db,
                    chunking_strategy=SemanticChunking(embedder=self.embedder),
                )
            elif document_type.lower() == "txt":
                kb = TextKnowledgeBase(
                    vector_db=vector_db,
                    chunking_strategy=SemanticChunking(embedder=self.embedder),
                )
            else:
                raise ValueError(f"Unsupported document type: {document_type}")
            self._knowledge_bases[table_name] = kb
            return kb
        except Exception as e:
            print(f"Could not connect to existing knowledge base: {e}")
            return None

    def list_knowledge_bases(self) -> Dict[str, Any]:
        """List all cached knowledge bases."""
        return {
            "cached_knowledge_bases": list(self._knowledge_bases.keys()),
            "total_count": len(self._knowledge_bases),
        }

    def remove_knowledge_base(self, table_name: str) -> bool:
        """Remove a knowledge base from cache; True if it was present."""
        if table_name in self._knowledge_bases:
            del self._knowledge_bases[table_name]
            return True
        return False
Issue is not resolved
please help me team @WillemdeJongh1 @Monali
Hey @msmaths99
Mind sharing a snippet of exactly what you are running, along with the log outputs please.
CODE:
"""Knowledge base manager for handling multiple document knowledge bases."""
import os
import logging
from dotenv import load_dotenv
from typing import Dict, Optional, Any
from agno.embedder.azure_openai import AzureOpenAIEmbedder
from agno.vectordb.pgvector import PgVector, SearchType
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader
from agno.knowledge.docx import DocxKnowledgeBase, DocxReader
from agno.knowledge.text import TextKnowledgeBase, TextReader
from agno.document.chunking.semantic import SemanticChunking

# Configure logger
logging.basicConfig(level=logging.INFO)
# BUG FIX: was `logging.getLogger(_name_)` — the module-name dunder is
# `__name__`; `_name_` is an undefined name.
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# NOTE(review): earlier debugging attempts set AGNO_DEFAULT_EMBEDDER and a
# placeholder OPENAI_API_KEY here; neither is a documented fix — the embedder
# is passed explicitly to PgVector and SemanticChunking instead.

# Configuration (read once at import time; validated lazily by the manager).
AZURE_EMBEDDED_API_KEY = os.getenv("AZURE_EMBEDDED_API_KEY")
AZURE_EMBEDDED_ENDPOINT = os.getenv("AZURE_EMBEDDED_ENDPOINT")
AZURE_EMBEDDED_API_VERSION = os.getenv("AZURE_EMBEDDED_API_VERSION")
AZURE_EMBEDDED_DEPLOYMENT = os.getenv("AZURE_EMBEDDED_DEPLOYMENT")
DB_URL = os.getenv("POSTGRES_DB_URL")
class KnowledgeBaseManager:
    """Singleton class to manage knowledge bases for uploaded documents."""

    _instance = None
    # Cache of table_name -> knowledge base instance, shared by all callers.
    _knowledge_bases: Dict[str, Any] = {}
    _embedder = None

    def __new__(cls):
        # FIX: markdown escapes (`\__new_\_`, `\_instance`) broke the pasted
        # source; restored plain identifiers.
        if cls._instance is None:
            cls._instance = super(KnowledgeBaseManager, cls).__new__(cls)
        return cls._instance

    @property
    def embedder(self):
        """Lazy initialization of embedder with debug logging.

        Raises:
            ValueError: if any required Azure environment variable is unset.
        """
        if self._embedder is None:
            # FIX: fail fast with a clear message — the debug logging below
            # dereferences the endpoint/deployment strings and would raise an
            # opaque AttributeError if any env var were missing.
            if not all([AZURE_EMBEDDED_API_KEY, AZURE_EMBEDDED_ENDPOINT,
                        AZURE_EMBEDDED_API_VERSION, AZURE_EMBEDDED_DEPLOYMENT]):
                raise ValueError(
                    "Missing required Azure embedder environment variables: "
                    "AZURE_EMBEDDED_API_KEY, AZURE_EMBEDDED_ENDPOINT, "
                    "AZURE_EMBEDDED_API_VERSION, AZURE_EMBEDDED_DEPLOYMENT"
                )
            logger.info("Initializing AzureOpenAIEmbedder for semantic chunking and vector DB...")
            self._embedder = AzureOpenAIEmbedder(
                api_key=AZURE_EMBEDDED_API_KEY,
                azure_endpoint=AZURE_EMBEDDED_ENDPOINT,
                api_version=AZURE_EMBEDDED_API_VERSION,
                azure_deployment=AZURE_EMBEDDED_DEPLOYMENT,
            )
            # Debug: log embedder details (endpoint trimmed to its host stub).
            logger.info(
                f"Embedder initialized: "
                f"Deployment='{AZURE_EMBEDDED_DEPLOYMENT}', "
                f"Endpoint='{AZURE_EMBEDDED_ENDPOINT.split('//')[-1].split('.')[0]}...', "
                f"API Version='{AZURE_EMBEDDED_API_VERSION}'"
            )
            # Heuristic only: the model is inferred from the deployment name,
            # not introspected from the service.
            if "large" in AZURE_EMBEDDED_DEPLOYMENT.lower():
                logger.info("Using Azure deployment of 'text-embedding-3-large' (assumed from name).")
            elif "small" in AZURE_EMBEDDED_DEPLOYMENT.lower():
                logger.info("Using Azure deployment of 'text-embedding-3-small' (assumed from name).")
            else:
                logger.warning(
                    f"Deployment name '{AZURE_EMBEDDED_DEPLOYMENT}' does not clearly indicate model size. "
                    "Ensure it's the intended embedding model."
                )
        return self._embedder

    def get_vector_db(self, table_name: str):
        """Get or create vector database for a table.

        Raises:
            ValueError: if table_name is empty.
        """
        if not table_name:
            raise ValueError("Table name must be provided for vector DB")
        return PgVector(
            table_name=table_name,
            db_url=DB_URL,
            search_type=SearchType.hybrid,
            # Explicit embedder so PgVector does not fall back to OpenAI.
            embedder=self.embedder,
        )

    def create_knowledge_base(self, document_path: str, table_name: str, file_type: str):
        """Create and load a knowledge base for a document.

        Returns:
            dict with keys "kb" and "table_name".
        Raises:
            ValueError: if table_name is empty or file_type unsupported.
        """
        if not table_name:
            raise ValueError("Table name must be provided when creating knowledge base")
        vector_db = self.get_vector_db(table_name)
        if file_type.lower() == "pdf":
            print("==================file type=============", "pdf")
            kb = PDFKnowledgeBase(
                path=document_path,
                vector_db=vector_db,
                reader=PDFReader(chunk=False),
                # Embedder passed explicitly so semantic chunking uses Azure.
                chunking_strategy=SemanticChunking(embedder=self.embedder),
            )
        elif file_type.lower() in ["doc", "docx"]:
            kb = DocxKnowledgeBase(
                path=document_path,
                vector_db=vector_db,
                reader=DocxReader(chunk=False),
                chunking_strategy=SemanticChunking(embedder=self.embedder),
            )
        elif file_type.lower() == "txt":
            kb = TextKnowledgeBase(
                path=document_path,
                vector_db=vector_db,
                reader=TextReader(chunk=False),
                chunking_strategy=SemanticChunking(embedder=self.embedder),
            )
        else:
            raise ValueError(f"Unsupported file type: {file_type}")
        # Load the document into the knowledge base
        kb.load(recreate=True)
        # Store in cache
        self._knowledge_bases[table_name] = kb
        print(f"Knowledge base created and loaded for table: {table_name}")
        # Always return both KB and table_name so downstream code never loses it
        return {"kb": kb, "table_name": table_name}

    def get_knowledge_base(self, table_name: str, document_type: str = "pdf") -> Optional[Any]:
        """Get a cached knowledge base or connect to an existing one.

        Returns None (best effort) when connection fails.
        Raises:
            ValueError: if table_name is empty.
        """
        if not table_name:
            raise ValueError("Table name must be provided when retrieving knowledge base")
        if table_name in self._knowledge_bases:
            return self._knowledge_bases[table_name]
        try:
            vector_db = self.get_vector_db(table_name)
            if document_type.lower() == "pdf":
                kb = PDFKnowledgeBase(
                    vector_db=vector_db,
                    chunking_strategy=SemanticChunking(embedder=self.embedder),
                )
            elif document_type.lower() in ["doc", "docx"]:
                kb = DocxKnowledgeBase(
                    vector_db=vector_db,
                    chunking_strategy=SemanticChunking(embedder=self.embedder),
                )
            elif document_type.lower() == "txt":
                kb = TextKnowledgeBase(
                    vector_db=vector_db,
                    chunking_strategy=SemanticChunking(embedder=self.embedder),
                )
            else:
                raise ValueError(f"Unsupported document type: {document_type}")
            self._knowledge_bases[table_name] = kb
            return kb
        except Exception as e:
            print(f"Could not connect to existing knowledge base: {e}")
            return None

    def list_knowledge_bases(self) -> Dict[str, Any]:
        """List all cached knowledge bases."""
        return {
            "cached_knowledge_bases": list(self._knowledge_bases.keys()),
            "total_count": len(self._knowledge_bases),
        }

    def remove_knowledge_base(self, table_name: str) -> bool:
        """Remove a knowledge base from cache; True if it was present."""
        if table_name in self._knowledge_bases:
            del self._knowledge_bases[table_name]
            return True
        return False
LOG:
INFO: 127.0.0.1:55468 - “POST /api/auth/token HTTP/1.1” 200 OK
INFO:Documents-API:[UPLOAD-BATCH] Processing file: Business_Requirements_Document.pdf
INFO:Documents-API:[UPLOAD-BATCH] Created DB document with ID: 42acb85c-8c57-4702-95bb-250d0f99ed0e
Document file saved to: C:\TruVelocity\agentic-sdlc-platform\src\storage\uploads\business_requirements_document_42acb85c-8c57-4702-95bb-250d0f99ed0e.pdf
INFO:src.agents.user_story_generator.document_embedding_generator.knowledge_base_manager:Initializing AzureOpenAIEmbedder for semantic chunking and vector DB…
INFO:src.agents.user_story_generator.document_embedding_generator.knowledge_base_manager:Embedder initialized: Deployment=‘text-embedding-3-large’, Endpoint=‘truvelocity…’, API Version=‘2024-12-01-preview’
INFO:src.agents.user_story_generator.document_embedding_generator.knowledge_base_manager:Using Azure deployment of ‘text-embedding-3-large’ (assumed from name).
==================file type============= pdf
C:\TruVelocity\agentic-sdlc-platform.venv\Lib\site-packages\chonkie\embeddings\auto.py:87: UserWarning: Failed to load text-embedding-3-small with OpenAIEmbeddings: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Falling back to loading default provider model.
warnings.warn(
C:\TruVelocity\agentic-sdlc-platform.venv\Lib\site-packages\chonkie\embeddings\auto.py:95: UserWarning: Failed to load the default model for OpenAIEmbeddings: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Falling back to SentenceTransformerEmbeddings.
warnings.warn(
INFO:Documents-API:[UPLOAD-BATCH] Processor result: {‘status’: ‘error’, ‘message’: ‘Failed to process document: Failed to load PDF: Failed to load embeddings via SentenceTransformerEmbeddings after registry/fallback failure: sentence_transformers is not available. Please install it via pip install chonkie[st]
’, ‘filename’: ‘Business_Requirements_Document.pdf’}
INFO:Documents-API:[UPLOAD-BATCH] Type: <class ‘dict’>, Keys: [‘status’, ‘message’, ‘filename’]
WARNING:Documents-API:No persona output for Business_Requirements_Document.pdf
WARNING:Documents-API:No glossary output for Business_Requirements_Document.pdf
INFO:Documents-API:[UPLOAD-BATCH] Success: 1 files processed.
INFO: 127.0.0.1:65421 - “POST /api/documents/document-upload-batch HTTP/1.1” 200 OK
Please have a look at logs and it is looking for openai embedder and sentence_transformer
@WillemdeJongh1 @windrunnner @wen.g.gong_9378 @mustafa @Kristen_play
hey guys,
please have a look at this