Function to generate answers for each question in the evaluation dataset
import json
import os
%run -i ./search.ipynb
def generate_answers_for_qa(evaluation_data_path, search_index_name, embedding_function, path_to_output):
    """Generate an answer for every question in the evaluation dataset and save the results.

    For each QA pair in the evaluation file: search the given index for relevant
    chunks, build a prompt from the question plus the retrieved context, ask the
    LLM for an answer, and record the generated answer alongside the original
    ground-truth fields and the retrieved sources/contexts/chunk ids.

    Parameters:
        evaluation_data_path: Path to the JSON list of evaluation QA pairs.
            Each entry is expected to carry "user_prompt", "output_prompt",
            "context", "chunk_id", "source" and "root_chunk_id" keys.
        search_index_name: Name of the search index to query.
        embedding_function: Callable used to embed the query for vector search.
        path_to_output: Where to write the generated results as JSON. If the
            file already exists the whole run is skipped (LLM calls are slow
            and expensive, so results are never recomputed).
    """
    try:
        # Skip work that has already been done.
        if os.path.exists(path_to_output):
            print(f"QA already created at: {path_to_output} ")
            return
        with open(evaluation_data_path, "r", encoding="utf-8") as file:
            evaluation_data = json.load(file)
        generated_qa = []
        for data in evaluation_data:
            question = data["user_prompt"]
            # 1. Search in the index
            search_response = search_documents(
                search_index_name=search_index_name,
                input=question,
                embedding_function=embedding_function,
            )
            # normpath keeps retrieved paths comparable with the ground-truth
            # "source" field below regardless of separator style.
            retrieved_sources = [os.path.normpath(response["source"])
                                 for response in search_response]
            retrieved_contexts = [response["chunkContext"]
                                  for response in search_response]
            retrieved_chunk_ids = [response["chunkId"]
                                   for response in search_response]
            # 2. Create prompt with the query and retrieved documents
            prompt = create_prompt(question, search_response)
            # 3. Call the LLM to generate an answer
            # given the question and the retrieved documents
            response = call_llm(prompt)
            current_qa = {
                "user_prompt": question,
                "output_prompt": data["output_prompt"],
                "context": data["context"],
                "chunk_id": data["chunk_id"],
                "source": os.path.normpath(data["source"]),
                "root_chunk_id": data["root_chunk_id"],
                "generated_output": response,
                "retrieved_context": retrieved_contexts,
                "retrieved_source": retrieved_sources,
                "retrieved_chunk_id": retrieved_chunk_ids
            }
            generated_qa.append(current_qa)
        # Fix: write with an explicit UTF-8 encoding (the read side already
        # uses it) and keep non-ASCII retrieved text readable in the output.
        with open(path_to_output, "w", encoding="utf-8") as f:
            json.dump(generated_qa, f, ensure_ascii=False)
    except Exception:
        # Best-effort notebook behavior is preserved, but print the full
        # traceback instead of just the message so failures are locatable.
        import traceback
        traceback.print_exc()
/Users/raouf/handsontest/ai-hands-on-lab/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
Generate answers using ADA + fixed-size chunking
Took 10 min
%run -i ./search.ipynb
evaluation_data_path = "../output/qa/evaluation/qa_pairs_solutionops.json"
search_index_name = "fixed-size-chunks-180-30-engineering-mlops-ada"
embedding_function = oai_query_embedding
path_to_output = "../output/qa/results/fixed-size-chunks-180-30-engineering-mlops-ada.json"
generate_answers_for_qa(evaluation_data_path, search_index_name, embedding_function, path_to_output)
QA already created at: ../output/qa/results/fixed-size-chunks-180-30-engineering-mlops-ada.json
Generate answers using semantic chunking + an open-source embedding model
# # 1. Create the new index
# # TODO: Replace this with a name for your new index
# index_name = "semantic-chunking-eval"
# vector_size = 384
# create_index(index_name, vector_size)
# # 2. Upload the embeddings to the new index
# upload_data(file_path=pregenerated_semantic_chunks_embeddings_os,
# search_index_name=index_name)
import json
%run -i ./search.ipynb
evaluation_data_path = "../output/qa/evaluation/qa_pairs_solutionops.json"
search_index_name = "semantic-chunking-eval"
embedding_function = intfloat_e5_small_v2_query_embedding
path_to_output = "../output/qa/results/semantic-chunking-intfloat.json"
generate_answers_for_qa(evaluation_data_path, search_index_name, embedding_function, path_to_output)
QA already created at: ../output/qa/results/semantic-chunking-intfloat.json