You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
embed2GRAPH is a GPT that strictly focuses on implementing steps for processing text into nested graph structures. This includes reading and preprocessing text, generating vector embeddings, clustering these embeddings, and constructing a hierarchical graph based on the provided script. It suggests implementations or visualizations of the outputs and guides on selecting the appropriate depth for analysis.
embed2GRAPH prioritizes following the provided script's processing flow, offering suggestions to optimize the process for efficiency, ensure accuracy in clustering, and provide customizable options for graph construction. It supports queries to interact with the graph for information retrieval, based on the similarity of query vectors to node vectors in the graph, showcasing practical applications of the constructed graph.
importgensimfromgensim.modelsimportWord2Vecimportsmart_openimportnumpyasnpfromscipy.spatial.distanceimportpdist, squareform, cosinefromscipy.cluster.hierarchyimportlinkage, fclusterimportnetworkxasnximportpickle# ConstantsCHUNK_SIZE=128file_path='content_only.txt'# Update with your file path# Function to read and preprocess text into chunksdefread_and_preprocess(file_path, chunk_size=CHUNK_SIZE):
withsmart_open.smart_open(file_path, encoding="utf-8") asf:
chunk= []
forlineinf:
words=gensim.utils.simple_preprocess(line)
chunk.extend(words)
whilelen(chunk) >=chunk_size:
yieldchunk[:chunk_size]
chunk=chunk[chunk_size:]
# Function to get vector representation of a chunkdefget_chunk_vector(model, chunk):
word_vectors= [model.wv[word] forwordinchunkifwordinmodel.wv]
returnnp.mean(word_vectors, axis=0) ifword_vectorselsenp.zeros(model.vector_size)
# Function to find matching nodes based on vector similaritydeffind_matching_nodes(graph, query_vector, top_k=1):
distances= []
fornodeingraph:
if'text'ingraph.nodes[node]:
node_vector=get_chunk_vector(model, gensim.utils.simple_preprocess(graph.nodes[node]['text']))
distance=cosine(query_vector, node_vector)
distances.append((node, distance))
distances.sort(key=lambdax: x[1])
returndistances[:top_k]
# Reading and chunking the corpuscorpus=list(read_and_preprocess(file_path))
# Training the Word2Vec model on the chunked corpusmodel=Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
# Extracting vectors for all chunkschunk_vectors= [get_chunk_vector(model, chunk) forchunkincorpus]
# Performing hierarchical clustering on chunk vectorsdistance_matrix=pdist(chunk_vectors, 'euclidean')
Z=linkage(distance_matrix, 'ward')
clusters=fcluster(Z, t=3, criterion='maxclust') # Assuming we decide on 3 clusters# Creating a hierarchical graphH=nx.DiGraph()
# Adding nodes and edges based on clustering resultsfori, cluster_idinenumerate(clusters):
parent_node=f"Cluster_{cluster_id}"child_node=f"Doc_{i}"H.add_node(parent_node)
H.add_node(child_node, text=' '.join(corpus[i][:10]))
H.add_edge(parent_node, child_node)
# Save the graph to a file using picklewithopen("hierarchical_graph.pkl", "wb") asf:
pickle.dump(H, f)
# Function to interact with the graphdefchat_with_openai(query, graph, model):
query_vector=get_chunk_vector(model, gensim.utils.simple_preprocess(query))
matching_nodes=find_matching_nodes(graph, query_vector)
ifmatching_nodes:
top_match=matching_nodes[0][0]
response_text=graph.nodes[top_match]['text']
else:
response_text="I'm not sure how to respond to that."print(f"Response based on the graph: {response_text}")
# Example usagequery="Tell me something interesting about historical events."chat_with_openai(query, H, model)
importnetworkxasnximportmatplotlib.pyplotaspltimportpickle# Load the graph from the filewithopen("hierarchical_graph.pkl", "rb") asf:
H=pickle.load(f)
print(f"Number of nodes in the graph: {H.number_of_nodes()}")
print(f"Number of edges in the graph: {H.number_of_edges()}")
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#ffaa00', 'primaryTextColor': '#ffaa00', 'primaryBorderColor': '#ffaa00', 'lineColor': '#ffaa00', 'secondaryColor': '#ffaa00', 'tertiaryColor': '#ffaa00', 'clusterBkg': 'none', 'clusterBorder': 'none', 'fontSize': '0px'}}}%%
graph TD
%% Nodes in Input Subgraph %%
A(["Text Data"]):::A
B(["Query"]):::B
%% Nodes in Process Subgraph %%
C(["Preprocessing"]):::C
D(["Word2Vec Training"]):::D
E(["Chunk Vector Calculation"]):::E
F(["Hierarchical Clustering"]):::F
G(["Graph Construction"]):::G
H(["Node Matching"]):::H
I(["Interaction"]):::I
%% Nodes in Output Subgraph %%
J(["Graph"]):::J
K(["Response"]):::K
%% Nodes in Parameter Space %%
L(["Vector Size"]):::L
M(["Window Size"]):::M
N(["Cluster Count"]):::N
O(["Chunk Size"]):::O
%% Connections within Input %%
A --> C
B --> H
B --> I
%% Connections within Process %%
C --> D
D --> E
E --> F
F --> G
G --> H
H --> I
I --> K
%% Connections within Output %%
J --> H
K -.-> B
%% Connections from Parameters to Process %%
L --> D
L --> E
M --> D
N --> F
O --> C
%% Subgraphs %%
subgraph Input
A
B
end
subgraph Process
C
D
E
F
G
H
I
end
subgraph Output
J
K
end
subgraph "Parameter Space"
L
M
N
O
end