DRIFT Search
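DRIFT search (Dynamic Reasoning and Inference with Flexible Traversal) combines characteristics of local and global search: a query is first primed with community-report context and then refined through follow-up questions over entities, relationships, and text units. This notebook loads a previously indexed dataset and runs a DRIFT search query against it.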
In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
In [2]:
import os
from pathlib import Path
import pandas as pd
import tiktoken
from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_report_embeddings,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
    DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"
COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"
TEXT_UNIT_TABLE = "text_units"
COMMUNITY_LEVEL = 2
# read the entities and communities tables to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
print(f"Entity df columns: {entity_df.columns}")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content",
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)
print(f"Entity count: {len(entity_df)}")
entity_df.head()
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()
Entity df columns: Index(['id', 'human_readable_id', 'title', 'type', 'description', 'text_unit_ids', 'frequency', 'degree', 'x', 'y'], dtype='object')
Entity count: 18
Relationship count: 54
Text unit records: 5
Out[2]:
| | id | human_readable_id | text | n_tokens | document_ids | entity_ids | relationship_ids | covariate_ids |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 8e938693af886bfd081acbbe8384c3671446bff84a134a... | 1 | # Operation: Dulce\n\n## Chapter 1\n\nThe thru... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [745d28dd-be20-411b-85ff-1c69ca70e7b3, 9cba185... |
| 1 | fd1f46d32e1df6cd429542aeda3d64ddf3745ccb80f443... | 2 | , the hollow echo of the bay a stark reminder ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [4f9b461f-5e8f-465d-9586-e2fc81787062, 0f74618... |
| 2 | 7296d9a1f046854d59079dc183de8a054c27c4843d2979... | 3 | differently than praise from others. This was... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [3ef1be9c-4080-4fac-99bd-c4a636248904, 8730b20... |
| 3 | ac72722a02ac71242a2a91fca323198d04197daf60515d... | 4 | contrast to the rigid silence enveloping the ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [2c292047-b79a-4958-ab57-7bf7d7a22c92, 3cbd18a... |
| 4 | 4c277337d461a16aaf8f9760ddb8b44ef220e948a2341d... | 5 | a mask of duty.\n\nIn the midst of the descen... | 35 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [d084d615-3584-4ec8-9931-90aa6075c764, 4b84859... | [6efdc42e-69a2-47c0-97ec-4b296cd16d5e] | [db8da02f-f889-4bb5-8e81-ab2a72e380bb] |
In [3]:
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

token_encoder = tiktoken.encoding_for_model(llm_model)

embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)

text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)
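The cell above reads the model credentials and model names from environment variables; `graphrag init` writes `GRAPHRAG_API_KEY` to an `.env` file, and the two model variables must be set separately. As a minimal sketch, assuming OpenAI-hosted models, you could set placeholder values before running the notebook (the model names below are illustrative, not prescribed by GraphRAG):

```python
import os

# Placeholder values for illustration only; substitute your own key and model names.
os.environ.setdefault("GRAPHRAG_API_KEY", "<your-openai-api-key>")
os.environ.setdefault("GRAPHRAG_LLM_MODEL", "gpt-4o-mini")
os.environ.setdefault("GRAPHRAG_EMBEDDING_MODEL", "text-embedding-3-small")
```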
In [4]:
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
):
    """Read the community reports parquet file from the input directory."""
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)


report_df = read_community_reports(INPUT_DIR)
reports = read_indexer_reports(
    report_df,
    community_df,
    COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)
read_indexer_report_embeddings(reports, full_content_embedding_store)
In [5]:
drift_params = DRIFTSearchConfig(
    temperature=0,
    max_tokens=12_000,
    primer_folds=1,
    drift_k_followups=3,
    n_depth=3,
    n=1,
)

context_builder = DRIFTSearchContextBuilder(
    model=chat_model,
    text_embedder=text_embedder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    entity_text_embeddings=description_embedding_store,
    text_units=text_units,
    token_encoder=token_encoder,
    config=drift_params,
)

search = DRIFTSearch(
    model=chat_model, context_builder=context_builder, token_encoder=token_encoder
)
In [6]:
resp = await search.search("Who is agent Mercer?")
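Top-level `await` works here because notebook cells run inside an event loop; in a standalone script you would have to drive the coroutine yourself. A minimal sketch, assuming the `search` object constructed above:

```python
import asyncio


async def main() -> None:
    # DRIFTSearch.search is a coroutine, so it must run inside an event loop.
    result = await search.search("Who is agent Mercer?")
    print(result.response)


asyncio.run(main())
```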
In [7]:
resp.response
In [8]:
print(resp.context_data)