Semantic chunking with AzureOpenAI

Do you guys have any issues with semantic chunking with AzureOpenAI

I’m currently facing an issue — if anyone knows how to resolve it,

please help me out

Hey @msmaths99, thank you for reaching out. Could you please share the issue you are facing? That will help us assist you better.

I’m using azureopenai for embeddings
and trying to use semantic chunking but agno is calling its default openai embedder for semantic chunking and it’s not even using AzureOpenAI Embedder though we are passing AzureOpenAI.

“”“Knowledge base manager for handling multiple document knowledge bases”“”

import os
import logging
from dotenv import load_dotenv
from typing import Dict, Optional, Any
from agno.embedder.azure_openai import AzureOpenAIEmbedder
from agno.vectordb.pgvector import PgVector, SearchType
from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader
from agno.knowledge.docx import DocxKnowledgeBase, DocxReader
from agno.knowledge.text import TextKnowledgeBase, TextReader
from agno.document.chunking.semantic import SemanticChunking

Load environment variables

load_dotenv()

Configuration - Use correct Azure embedder environment variables

AZURE_EMBEDDER_OPENAI_API_KEY = os.getenv(“AZURE_EMBEDDER_OPENAI_API_KEY”)
AZURE_EMBEDDER_OPENAI_ENDPOINT = os.getenv(“AZURE_EMBEDDER_OPENAI_ENDPOINT”)
AZURE_EMBEDDER_DEPLOYMENT = os.getenv(“AZURE_EMBEDDER_DEPLOYMENT”)
DB_URL = os.getenv(“POSTGRES_URL”)

Set up logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(name)

class KnowledgeBaseManager:
“”“Singleton class to manage knowledge bases for uploaded documents”“”

_instance = None
_knowledge_bases: Dict[str, Any] = {}
_embedder = None

def __new__(cls):
    if cls._instance is None:
        cls._instance = super(KnowledgeBaseManager, cls).__new__(cls)
    return cls._instance

@property
def embedder(self):
    """Lazy initialization of Azure OpenAI embedder"""
    if self._embedder is None:
        # Validate required environment variables
        if not all([AZURE_EMBEDDER_OPENAI_API_KEY, AZURE_EMBEDDER_OPENAI_ENDPOINT, AZURE_EMBEDDER_DEPLOYMENT]):
            raise ValueError(
                "Missing required Azure embedder environment variables: "
                "AZURE_EMBEDDER_OPENAI_API_KEY, AZURE_EMBEDDER_OPENAI_ENDPOINT, AZURE_EMBEDDER_DEPLOYMENT"
            )
        
        self._embedder = AzureOpenAIEmbedder(
            api_key=AZURE_EMBEDDER_OPENAI_API_KEY,
            azure_endpoint=AZURE_EMBEDDER_OPENAI_ENDPOINT,
            azure_deployment=AZURE_EMBEDDER_DEPLOYMENT,
            # Optional: specify model if needed
            # model="text-embedding-3-small"
        )
        logger.info(f"Initialized Azure OpenAI embedder: {type(self._embedder)}")
    return self._embedder

def get_vector_db(self, table_name: str):
    """Get or create vector database for a table with Azure embedder"""
    try:
        vector_db = PgVector(
            table_name=table_name,
            db_url=DB_URL,
            search_type=SearchType.hybrid,
            embedder=self.embedder,  # This ensures Azure embedder is used
        )
        logger.info(f"Created PgVector with Azure embedder for table: {table_name}")
        return vector_db
    except Exception as e:
        logger.error(f"Failed to create vector database: {e}")
        raise

def create_knowledge_base(self, document_path: str, table_name: str, file_type: str):
    """Create and load knowledge base for a document"""
    try:
        # Validate file exists
        if not os.path.exists(document_path):
            raise FileNotFoundError(f"Document not found: {document_path}")
        
        vector_db = self.get_vector_db(table_name)
        
        if file_type.lower() == 'pdf':
            logger.info(f"Creating PDF knowledge base for: {document_path}")
            kb = PDFKnowledgeBase(
                path=document_path,
                vector_db=vector_db,
                reader=PDFReader(chunk=False),
                chunking_strategy=SemanticChunking()
            )
        elif file_type.lower() in ['doc', 'docx']:
            logger.info(f"Creating DOCX knowledge base for: {document_path}")
            kb = DocxKnowledgeBase(
                path=document_path,
                vector_db=vector_db,
                reader=DocxReader(chunk=False),
                chunking_strategy=SemanticChunking()
            )
        elif file_type.lower() == 'txt':
            logger.info(f"Creating Text knowledge base for: {document_path}")
            kb = TextKnowledgeBase(
                path=document_path,
                vector_db=vector_db,
                reader=TextReader(chunk=False),
                chunking_strategy=SemanticChunking()
            )
        else:
            raise ValueError(f"Unsupported file type: {file_type}")

        # Load the document into the knowledge base
        logger.info(f"Loading knowledge base for table: {table_name}")
        kb.load(recreate=True)
        
        # Store in cache
        self._knowledge_bases[table_name] = kb
        
        logger.info(f"Knowledge base created and loaded for table: {table_name}")
        return kb
        
    except Exception as e:
        logger.error(f"Failed to create knowledge base: {e}")
        raise

def get_knowledge_base(self, table_name: str, document_type: str = "pdf") -> Optional[Any]:
    """Get existing knowledge base or create new one"""
    if table_name in self._knowledge_bases:
        logger.info(f"Retrieved cached knowledge base: {table_name}")
        return self._knowledge_bases[table_name]
    
    # Try to connect to existing vector DB
    try:
        vector_db = self.get_vector_db(table_name)
        
        if document_type.lower() == 'pdf':
            kb = PDFKnowledgeBase(vector_db=vector_db)
        elif document_type.lower() in ['doc', 'docx']:
            kb = DocxKnowledgeBase(vector_db=vector_db)
        elif document_type.lower() == 'txt':
            kb = TextKnowledgeBase(vector_db=vector_db)
        else:
            raise ValueError(f"Unsupported document type: {document_type}")
                    
        self._knowledge_bases[table_name] = kb
        logger.info(f"Connected to existing knowledge base: {table_name}")
        return kb
        
    except Exception as e:
        logger.error(f"Could not connect to existing knowledge base: {e}")
        return None

def list_knowledge_bases(self) -> Dict[str, Any]:
    """List all cached knowledge bases"""
    return {
        "cached_knowledge_bases": list(self._knowledge_bases.keys()),
        "total_count": len(self._knowledge_bases),
        "embedder_type": type(self.embedder).__name__
    }

def remove_knowledge_base(self, table_name: str) -> bool:
    """Remove a knowledge base from cache"""
    if table_name in self._knowledge_bases:
        del self._knowledge_bases[table_name]
        logger.info(f"Removed knowledge base from cache: {table_name}")
        return True
    return False

def validate_configuration(self) -> Dict[str, Any]:
    """Validate the current configuration"""
    return {
        "embedder_configured": self._embedder is not None,
        "embedder_type": type(self.embedder).__name__ if self._embedder else None,
        "database_url_set": bool(DB_URL),
        "azure_credentials_set": bool(AZURE_EMBEDDER_OPENAI_API_KEY and AZURE_EMBEDDER_OPENAI_ENDPOINT and AZURE_EMBEDDER_DEPLOYMENT)
    }

Hi @msmaths99 — you need to provide the embedder in your SemanticChunking() here to ensure it uses the correct model. Otherwise it will fall back to the default OpenAI embedder.

STILL FACING THE SAME ERROR AND NOT RESOLVED WITH YOUR UPDATE

“”“Knowledge base manager for handling multiple document knowledge bases”“”

import os

from dotenv import load_dotenv

from typing import Dict, Optional, Any

from agno.embedder.azure_openai import AzureOpenAIEmbedder

from agno.vectordb.pgvector import PgVector, SearchType

from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader

from agno.knowledge.docx import DocxKnowledgeBase, DocxReader

from agno.knowledge.text import TextKnowledgeBase, TextReader

from agno.document.chunking.semantic import SemanticChunking

# Load environment variables

load_dotenv()

# Ensure default embedder is always Azure

os.environ[“AGNO_DEFAULT_EMBEDDER”] = “azure_openai”

# Configuration

AZURE_EMBEDDED_API_KEY = os.getenv(“AZURE_EMBEDDED_API_KEY”)

AZURE_EMBEDDED_ENDPOINT = os.getenv(“AZURE_EMBEDDED_ENDPOINT”)

AZURE_EMBEDDED_API_VERSION = os.getenv(“AZURE_EMBEDDED_API_VERSION”)

AZURE_EMBEDDED_DEPLOYMENT = os.getenv(“AZURE_EMBEDDED_DEPLOYMENT”)

DB_URL = os.getenv(“POSTGRES_DB_URL”)

class KnowledgeBaseManager:

“”“Singleton class to manage knowledge bases for uploaded documents”“”

_instance = None

_knowledge_bases: Dict[str, Any] = {}

_embedder = None

def _new_(cls):

if cls._instance is None:

cls._instance = super(KnowledgeBaseManager, cls)._new_(cls)

return cls._instance

@propertyproperty

def embedder(self):

“”“Lazy initialization of embedder”“”

if self._embedder is None:

self._embedder = AzureOpenAIEmbedder(

api_key=AZURE_EMBEDDED_API_KEY,

azure_endpoint=AZURE_EMBEDDED_ENDPOINT,

api_version=AZURE_EMBEDDED_API_VERSION,

azure_deployment=AZURE_EMBEDDED_DEPLOYMENT,

)

return self._embedder

def get_vector_db(self, table_name: str):

“”“Get or create vector database for a table”“”

if not table_name:

raise ValueError(“Table name must be provided for vector DB”)

return PgVector(

table_name=table_name,

db_url=DB_URL,

search_type=SearchType.hybrid,

embedder=self.embedder,

)

def create_knowledge_base(self, document_path: str, table_name: str, file_type: str):

“”“Create and load knowledge base for a document”“”

if not table_name:

raise ValueError(“Table name must be provided when creating knowledge base”)

vector_db = self.get_vector_db(table_name)

if file_type.lower() == “pdf”:

print(“==================file type=============”, “pdf”)

kb = PDFKnowledgeBase(

path=document_path,

vector_db=vector_db,

reader=PDFReader(chunk=False),

chunking_strategy=SemanticChunking(embedder=self.embedder),

)

elif file_type.lower() in [“doc”, “docx”]:

kb = DocxKnowledgeBase(

path=document_path,

vector_db=vector_db,

reader=DocxReader(chunk=False),

chunking_strategy=SemanticChunking(embedder=self.embedder),

)

elif file_type.lower() == “txt”:

kb = TextKnowledgeBase(

path=document_path,

vector_db=vector_db,

reader=TextReader(chunk=False),

chunking_strategy=SemanticChunking(embedder=self.embedder),

)

else:

raise ValueError(f"Unsupported file type: {file_type}")

# Load the document into the knowledge base

kb.load(recreate=True)

# Store in cache

self._knowledge_bases[table_name] = kb

print(f"Knowledge base created and loaded for table: {table_name}")

# Always return both KB and table_name so downstream code never loses it

return {“kb”: kb, “table_name”: table_name}

def get_knowledge_base(self, table_name: str, document_type: str = “pdf”) → Optional[Any]:

“”“Get existing knowledge base or connect to an existing one”“”

if not table_name:

raise ValueError(“Table name must be provided when retrieving knowledge base”)

if table_name in self._knowledge_bases:

return self._knowledge_bases[table_name]

try:

vector_db = self.get_vector_db(table_name)

if document_type.lower() == “pdf”:

kb = PDFKnowledgeBase(

vector_db=vector_db,

chunking_strategy=SemanticChunking(embedder=self.embedder),

)

elif document_type.lower() in [“doc”, “docx”]:

kb = DocxKnowledgeBase(

vector_db=vector_db,

chunking_strategy=SemanticChunking(embedder=self.embedder),

)

elif document_type.lower() == “txt”:

kb = TextKnowledgeBase(

vector_db=vector_db,

chunking_strategy=SemanticChunking(embedder=self.embedder),

)

else:

raise ValueError(f"Unsupported document type: {document_type}")

self._knowledge_bases[table_name] = kb

return kb

except Exception as e:

print(f"Could not connect to existing knowledge base: {e}")

return None

def list_knowledge_bases(self) → Dict[str, Any]:

“”“List all cached knowledge bases”“”

return {

“cached_knowledge_bases”: list(self._knowledge_bases.keys()),

“total_count”: len(self._knowledge_bases),

}

def remove_knowledge_base(self, table_name: str) → bool:

“”“Remove a knowledge base from cache”“”

if table_name in self._knowledge_bases:

del self._knowledge_bases[table_name]

return True

return False

Issue is not resolved

please help me team @WillemdeJongh1 @Monali

Hey @msmaths99

Mind sharing a snippet of exactly what you are running, along with the log outputs please.

CODE:

“”“Knowledge base manager for handling multiple document knowledge bases”“”

import os

from dotenv import load_dotenv

from typing import Dict, Optional, Any

from agno.embedder.azure_openai import AzureOpenAIEmbedder

from agno.vectordb.pgvector import PgVector, SearchType

from agno.knowledge.pdf import PDFKnowledgeBase, PDFReader

from agno.knowledge.docx import DocxKnowledgeBase, DocxReader

from agno.knowledge.text import TextKnowledgeBase, TextReader

from agno.document.chunking.semantic import SemanticChunking

import logging

from agno.embedder.azure_openai import AzureOpenAIEmbedder

# Configure logger

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(_name_)

# Load environment variables

load_dotenv()

# Ensure default embedder is always Azure

#os.environ[“AGNO_DEFAULT_EMBEDDER”] = “azure_openai”

#os.environ[“OPENAI_API_KEY”] = “sk-proj-”

# Configuration

AZURE_EMBEDDED_API_KEY = os.getenv(“AZURE_EMBEDDED_API_KEY”)

AZURE_EMBEDDED_ENDPOINT = os.getenv(“AZURE_EMBEDDED_ENDPOINT”)

AZURE_EMBEDDED_API_VERSION = os.getenv(“AZURE_EMBEDDED_API_VERSION”)

AZURE_EMBEDDED_DEPLOYMENT = os.getenv(“AZURE_EMBEDDED_DEPLOYMENT”)

DB_URL = os.getenv(“POSTGRES_DB_URL”)

class KnowledgeBaseManager:

"""Singleton class to manage knowledge bases for uploaded documents"""



\_instance = None

\_knowledge_bases: Dict\[str, Any\] = {}

\_embedder = None



def \__new_\_(cls):

    if cls.\_instance is None:

        cls.\_instance = super(KnowledgeBaseManager, cls).\__new_\_(cls)

    return cls.\_instance



@property

def embedder(self):

    """Lazy initialization of embedder with debug logging"""

    if self.\_embedder is None:

        logger.info("Initializing AzureOpenAIEmbedder for semantic chunking and vector DB...")



        self.\_embedder = AzureOpenAIEmbedder(

            api_key=AZURE_EMBEDDED_API_KEY,

            azure_endpoint=AZURE_EMBEDDED_ENDPOINT,

            api_version=AZURE_EMBEDDED_API_VERSION,

            azure_deployment=AZURE_EMBEDDED_DEPLOYMENT,

        )



        \# 🔍 Debug: Log embedder details

        logger.info(

            f"Embedder initialized: "

            f"Deployment='{AZURE_EMBEDDED_DEPLOYMENT}', "

            f"Endpoint='{AZURE_EMBEDDED_ENDPOINT.split('//')\[-1\].split('.')\[0\]}...', "

            f"API Version='{AZURE_EMBEDDED_API_VERSION}'"

        )



        \# Optional: If your library allows introspection of the model name

        \# Or if you want to assert it based on deployment name

        if "large" in AZURE_EMBEDDED_DEPLOYMENT.lower():

            logger.info("Using Azure deployment of 'text-embedding-3-large' (assumed from name).")

        elif "small" in AZURE_EMBEDDED_DEPLOYMENT.lower():

            logger.info("Using Azure deployment of 'text-embedding-3-small' (assumed from name).")

        else:

            logger.warning(

                f"Deployment name '{AZURE_EMBEDDED_DEPLOYMENT}' does not clearly indicate model size. "

                "Ensure it's the intended embedding model."

            )



    return self.\_embedder



def get_vector_db(self, table_name: str):

    """Get or create vector database for a table"""

    if not table_name:

        raise ValueError("Table name must be provided for vector DB")

    return PgVector(

        table_name=table_name,

        db_url=DB_URL,

        search_type=SearchType.hybrid,

        embedder=self.embedder,

    )



def create_knowledge_base(self, document_path: str, table_name: str, file_type: str):

    """Create and load knowledge base for a document"""

    if not table_name:

        raise ValueError("Table name must be provided when creating knowledge base")



    vector_db = self.get_vector_db(table_name)



    if file_type.lower() == "pdf":

        print("==================file type=============", "pdf")

        kb = PDFKnowledgeBase(

            path=document_path,

            vector_db=vector_db,

            reader=PDFReader(chunk=False),

            chunking_strategy=SemanticChunking(embedder=self.embedder),

        )

    elif file_type.lower() in \["doc", "docx"\]:

        kb = DocxKnowledgeBase(

            path=document_path,

            vector_db=vector_db,

            reader=DocxReader(chunk=False),

            chunking_strategy=SemanticChunking(embedder=self.embedder),

        )

    elif file_type.lower() == "txt":

        kb = TextKnowledgeBase(

            path=document_path,

            vector_db=vector_db,

            reader=TextReader(chunk=False),

            chunking_strategy=SemanticChunking(embedder=self.embedder),

        )

    else:

        raise ValueError(f"Unsupported file type: {file_type}")



    \# Load the document into the knowledge base

    kb.load(recreate=True)



    \# Store in cache

    self.\_knowledge_bases\[table_name\] = kb



    print(f"Knowledge base created and loaded for table: {table_name}")



    \# Always return both KB and table_name so downstream code never loses it

    return {"kb": kb, "table_name": table_name}



def get_knowledge_base(self, table_name: str, document_type: str = "pdf") -> Optional\[Any\]:

    """Get existing knowledge base or connect to an existing one"""

    if not table_name:

        raise ValueError("Table name must be provided when retrieving knowledge base")



    if table_name in self.\_knowledge_bases:

        return self.\_knowledge_bases\[table_name\]



    try:

        vector_db = self.get_vector_db(table_name)



        if document_type.lower() == "pdf":

            kb = PDFKnowledgeBase(

                vector_db=vector_db,

                chunking_strategy=SemanticChunking(embedder=self.embedder),

            )

        elif document_type.lower() in \["doc", "docx"\]:

            kb = DocxKnowledgeBase(

                vector_db=vector_db,

                chunking_strategy=SemanticChunking(embedder=self.embedder),

            )

        elif document_type.lower() == "txt":

            kb = TextKnowledgeBase(

                vector_db=vector_db,

                chunking_strategy=SemanticChunking(embedder=self.embedder),

            )

        else:

            raise ValueError(f"Unsupported document type: {document_type}")



        self.\_knowledge_bases\[table_name\] = kb

        return kb

    except Exception as e:

        print(f"Could not connect to existing knowledge base: {e}")

        return None



def list_knowledge_bases(self) -> Dict\[str, Any\]:

    """List all cached knowledge bases"""

    return {

        "cached_knowledge_bases": list(self.\_knowledge_bases.keys()),

        "total_count": len(self.\_knowledge_bases),

    }



def remove_knowledge_base(self, table_name: str) -> bool:

    """Remove a knowledge base from cache"""

    if table_name in self.\_knowledge_bases:

        del self.\_knowledge_bases\[table_name\]

        return True

    return False

LOG:
INFO: 127.0.0.1:55468 - “POST /api/auth/token HTTP/1.1” 200 OK
INFO:Documents-API:[UPLOAD-BATCH] Processing file: Business_Requirements_Document.pdf
INFO:Documents-API:[UPLOAD-BATCH] Created DB document with ID: 42acb85c-8c57-4702-95bb-250d0f99ed0e
Document file saved to: C:\TruVelocity\agentic-sdlc-platform\src\storage\uploads\business_requirements_document_42acb85c-8c57-4702-95bb-250d0f99ed0e.pdf
INFO:src.agents.user_story_generator.document_embedding_generator.knowledge_base_manager:Initializing AzureOpenAIEmbedder for semantic chunking and vector DB…
INFO:src.agents.user_story_generator.document_embedding_generator.knowledge_base_manager:Embedder initialized: Deployment=‘text-embedding-3-large’, Endpoint=‘truvelocity…’, API Version=‘2024-12-01-preview’
INFO:src.agents.user_story_generator.document_embedding_generator.knowledge_base_manager:Using Azure deployment of ‘text-embedding-3-large’ (assumed from name).
==================file type============= pdf
C:\TruVelocity\agentic-sdlc-platform.venv\Lib\site-packages\chonkie\embeddings\auto.py:87: UserWarning: Failed to load text-embedding-3-small with OpenAIEmbeddings: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Falling back to loading default provider model.
warnings.warn(
C:\TruVelocity\agentic-sdlc-platform.venv\Lib\site-packages\chonkie\embeddings\auto.py:95: UserWarning: Failed to load the default model for OpenAIEmbeddings: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
Falling back to SentenceTransformerEmbeddings.
warnings.warn(
INFO:Documents-API:[UPLOAD-BATCH] Processor result: {‘status’: ‘error’, ‘message’: ‘Failed to process document: Failed to load PDF: Failed to load embeddings via SentenceTransformerEmbeddings after registry/fallback failure: sentence_transformers is not available. Please install it via pip install chonkie[st]’, ‘filename’: ‘Business_Requirements_Document.pdf’}
INFO:Documents-API:[UPLOAD-BATCH] Type: <class ‘dict’>, Keys: [‘status’, ‘message’, ‘filename’]
WARNING:Documents-API:No persona output for Business_Requirements_Document.pdf
WARNING:Documents-API:No glossary output for Business_Requirements_Document.pdf
INFO:Documents-API:[UPLOAD-BATCH] Success: 1 files processed.
INFO: 127.0.0.1:65421 - “POST /api/documents/document-upload-batch HTTP/1.1” 200 OK

Please have a look at the logs — despite passing the Azure embedder, it is still trying to load the OpenAI embedder and then falling back to sentence_transformers.

@WillemdeJongh1 @windrunnner @wen.g.gong_9378 @mustafa @Kristen_play

hey guys,
please have a look at this