Graph generation¶
In [1]:
Copied!
import os
from dotenv import load_dotenv
# Load environment variables from the .env file; the API key read with
# os.getenv in the EntityRelationshipExtractor cell is expected to come from here.
# Kept as the cell's last expression so its return value is displayed as Out[1].
load_dotenv()
import os
from dotenv import load_dotenv
# Load environment variables from the .env file; the API key read with
# os.getenv in the EntityRelationshipExtractor cell is expected to come from here.
# Kept as the cell's last expression so its return value is displayed as Out[1].
load_dotenv()
Out[1]:
True
Load (pre-created) text units¶
In [2]:
Copied!
import pandas as pd

# Read the pre-created text units from the sample parquet file.
df_text_units = pd.read_parquet("sample-data/base_text_units.parquet")

# Report the row count and announce the preview with a single, newline-separated
# print call; the emitted text matches two consecutive prints.
print(
    f"Number of rows: {len(df_text_units)}",
    "Displaying first 5 rows:",
    sep="\n",
)

# Last expression of the cell: the first five rows are rendered as the output.
df_text_units.head(n=5)
import pandas as pd

# Read the pre-created text units from the sample parquet file.
df_text_units = pd.read_parquet("sample-data/base_text_units.parquet")

# Report the row count and announce the preview with a single, newline-separated
# print call; the emitted text matches two consecutive prints.
print(
    f"Number of rows: {len(df_text_units)}",
    "Displaying first 5 rows:",
    sep="\n",
)

# Last expression of the cell: the first five rows are rendered as the output.
df_text_units.head(n=5)
Number of rows: 39 Displaying first 5 rows:
Out[2]:
| | id | document_id | text_unit |
|---|---|---|---|
| 0 | f28e49bc-5b67-46b3-b971-6d6cb2832790 | a0192baf-d76a-40d4-bcd3-437127eef568 | A CHRISTMAS CAROL\n\n [Illustration: _"How... |
| 1 | 6fae26d7-9b26-4f79-ac78-970e69fcab95 | a0192baf-d76a-40d4-bcd3-437127eef568 | at the grindstone, Scrooge! a\nsqueezing, wre... |
| 2 | c93ae0c0-c8c3-49a9-beb0-a1e3b74efa0a | a0192baf-d76a-40d4-bcd3-437127eef568 | dismal? What reason have you to be morose? You... |
| 3 | eef3623c-46d7-4c17-99e6-a4801a779a39 | a0192baf-d76a-40d4-bcd3-437127eef568 | in Scrooge's office. They had books and paper... |
| 4 | 6e9a467e-3a92-45d3-acd4-2d1c6996ee28 | a0192baf-d76a-40d4-bcd3-437127eef568 | mighty Mansion House, gave orders to his\nfif... |
Create dependencies¶
The GraphGenerator uses:
- EntityRelationshipExtractor
- GraphsMerger
- EntityRelationshipDescriptionSummarizer
EntityRelationshipExtractor¶
In [3]:
Copied!
from langchain_openai import ChatOpenAI
from langchain_community.cache import SQLiteCache
from langchain_graphrag.indexing.graph_generation import EntityRelationshipExtractor

# Fail fast with a clear message when the chat API key is absent from the environment.
openai_api_key = os.getenv("LANGCHAIN_GRAPHRAG_OPENAI_CHAT_API_KEY")
if openai_api_key is None:
    raise ValueError(
        "Please set the LANGCHAIN_GRAPHRAG_OPENAI_CHAT_API_KEY environment variable"
    )

# Chat model used for entity/relationship extraction.
# An LLM cache (SQLiteCache on "openai_cache.db") is attached - always a good idea.
er_llm = ChatOpenAI(
    cache=SQLiteCache("openai_cache.db"),
    api_key=openai_api_key,
    temperature=0.0,
    model="gpt-4o-mini",
)

# A static method is provided to build the default extractor.
extractor = EntityRelationshipExtractor.build_default(llm=er_llm)
from langchain_openai import ChatOpenAI
from langchain_community.cache import SQLiteCache
from langchain_graphrag.indexing.graph_generation import EntityRelationshipExtractor

# Fail fast with a clear message when the chat API key is absent from the environment.
openai_api_key = os.getenv("LANGCHAIN_GRAPHRAG_OPENAI_CHAT_API_KEY")
if openai_api_key is None:
    raise ValueError(
        "Please set the LANGCHAIN_GRAPHRAG_OPENAI_CHAT_API_KEY environment variable"
    )

# Chat model used for entity/relationship extraction.
# An LLM cache (SQLiteCache on "openai_cache.db") is attached - always a good idea.
er_llm = ChatOpenAI(
    cache=SQLiteCache("openai_cache.db"),
    api_key=openai_api_key,
    temperature=0.0,
    model="gpt-4o-mini",
)

# A static method is provided to build the default extractor.
extractor = EntityRelationshipExtractor.build_default(llm=er_llm)
GraphsMerger¶
In [4]:
Copied!
from langchain_graphrag.indexing.graph_generation import GraphsMerger
# Create the merger with its default constructor arguments; it is one of the
# three dependencies listed under "Create dependencies".
graphs_merger = GraphsMerger()
from langchain_graphrag.indexing.graph_generation import GraphsMerger
# Create the merger with its default constructor arguments; it is one of the
# three dependencies listed under "Create dependencies".
graphs_merger = GraphsMerger()
EntityRelationshipDescriptionSummarizer¶
In [5]:
Copied!
from langchain_graphrag.indexing.graph_generation import EntityRelationshipDescriptionSummarizer

# Separate chat model instance for description summarization, configured with
# the same model, temperature, API key and cache file as the extraction model.
es_llm = ChatOpenAI(
    cache=SQLiteCache("openai_cache.db"),  # always a good idea to use a cache
    api_key=openai_api_key,
    temperature=0.0,
    model="gpt-4o-mini",
)

# Build the summarizer through its default factory, driven by the model above.
summarizer = EntityRelationshipDescriptionSummarizer.build_default(llm=es_llm)
from langchain_graphrag.indexing.graph_generation import EntityRelationshipDescriptionSummarizer

# Separate chat model instance for description summarization, configured with
# the same model, temperature, API key and cache file as the extraction model.
es_llm = ChatOpenAI(
    cache=SQLiteCache("openai_cache.db"),  # always a good idea to use a cache
    api_key=openai_api_key,
    temperature=0.0,
    model="gpt-4o-mini",
)

# Build the summarizer through its default factory, driven by the model above.
summarizer = EntityRelationshipDescriptionSummarizer.build_default(llm=es_llm)
Finally generating the graph¶
In [6]:
Copied!
from langchain_graphrag.indexing.graph_generation import (
    GraphGenerator,
)

# Wire together the three dependencies created in the preceding cells.
# The `graphs_merger` instance built in the GraphsMerger cell is reused here
# rather than constructing a second, identically-configured GraphsMerger(),
# so the dependency the notebook introduces is the one actually in use.
graph_generator = GraphGenerator(
    er_extractor=extractor,
    graphs_merger=graphs_merger,
    er_description_summarizer=summarizer,
)
from langchain_graphrag.indexing.graph_generation import (
    GraphGenerator,
)

# Wire together the three dependencies created in the preceding cells.
# The `graphs_merger` instance built in the GraphsMerger cell is reused here
# rather than constructing a second, identically-configured GraphsMerger(),
# so the dependency the notebook introduces is the one actually in use.
graph_generator = GraphGenerator(
    er_extractor=extractor,
    graphs_merger=graphs_merger,
    er_description_summarizer=summarizer,
)
In [7]:
Copied!
# Run the generator over the loaded text units. The recorded output shows
# progress bars for entity/relationship extraction and for summarizing the
# entity and relationship descriptions.
graph = graph_generator.run(df_text_units)
# Run the generator over the loaded text units. The recorded output shows
# progress bars for entity/relationship extraction and for summarizing the
# entity and relationship descriptions.
graph = graph_generator.run(df_text_units)
Extracting entities and relationships ...: 100%|██████████| 39/39 [00:00<00:00, 157.56it/s] Summarizing entities descriptions: 100%|██████████| 116/116 [00:00<00:00, 1075.01it/s] Summarizing relationship descriptions: 100%|██████████| 160/160 [00:02<00:00, 63.72it/s]
In [8]:
Copied!
# Summarize the size of the generated graph: node count first, then edge count.
print(
    f"Number of nodes - {len(graph.nodes)}",
    f"Number of edges - {len(graph.edges)}",
    sep="\n",
)
# Summarize the size of the generated graph: node count first, then edge count.
print(
    f"Number of nodes - {len(graph.nodes)}",
    f"Number of edges - {len(graph.edges)}",
    sep="\n",
)
Number of nodes - 116 Number of edges - 160