Function to generate answers for each question in the evaluation dataset
import json
import os
%run -i ./search.ipynb
def generate_answers_for_qa(evaluation_data_path, search_index_name, embedding_function, path_to_output):
    """Generate an answer for every question in the evaluation dataset and save the results.

    For each QA pair in the evaluation file: search the given index for relevant
    chunks, build a prompt from the question plus the retrieved context, ask the
    LLM for an answer, and record the generated answer alongside the original
    ground-truth fields and the retrieved sources/contexts/chunk ids.

    Parameters:
        evaluation_data_path: Path to the JSON list of evaluation QA pairs.
            Each entry is expected to carry "user_prompt", "output_prompt",
            "context", "chunk_id", "source" and "root_chunk_id" keys.
        search_index_name: Name of the search index to query.
        embedding_function: Callable used to embed the query for vector search.
        path_to_output: Where to write the generated results as JSON. If the
            file already exists the whole run is skipped (LLM calls are slow
            and expensive, so results are never recomputed).
    """
    try:
        # Skip work that has already been done.
        if os.path.exists(path_to_output):
            print(f"QA already created at: {path_to_output} ")
            return
        with open(evaluation_data_path, "r", encoding="utf-8") as file:
            evaluation_data = json.load(file)
        generated_qa = []
        for data in evaluation_data:
            question = data["user_prompt"]
            # 1. Search in the index
            search_response = search_documents(
                search_index_name=search_index_name,
                input=question,
                embedding_function=embedding_function,
            )
            # normpath keeps retrieved paths comparable with the ground-truth
            # "source" field below regardless of separator style.
            retrieved_sources = [os.path.normpath(response["source"])
                                 for response in search_response]
            retrieved_contexts = [response["chunkContext"]
                                  for response in search_response]
            retrieved_chunk_ids = [response["chunkId"]
                                   for response in search_response]
            # 2. Create prompt with the query and retrieved documents
            prompt = create_prompt(question, search_response)
            # 3. Call the LLM to generate an answer
            # given the question and the retrieved documents
            response = call_llm(prompt)
            current_qa = {
                "user_prompt": question,
                "output_prompt": data["output_prompt"],
                "context": data["context"],
                "chunk_id": data["chunk_id"],
                "source": os.path.normpath(data["source"]),
                "root_chunk_id": data["root_chunk_id"],
                "generated_output": response,
                "retrieved_context": retrieved_contexts,
                "retrieved_source": retrieved_sources,
                "retrieved_chunk_id": retrieved_chunk_ids
            }
            generated_qa.append(current_qa)
        # Fix: write with an explicit UTF-8 encoding (the read side already
        # uses it) and keep non-ASCII retrieved text readable in the output.
        with open(path_to_output, "w", encoding="utf-8") as f:
            json.dump(generated_qa, f, ensure_ascii=False)
    except Exception:
        # Best-effort notebook behavior is preserved, but print the full
        # traceback instead of just the message so failures are locatable.
        import traceback
        traceback.print_exc()
/Users/raouf/handsontest/ai-hands-on-lab/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
Generate answers using ADA + fixed-size chunking
Took 10 min
%run -i ./search.ipynb
evaluation_data_path = "../output/qa/evaluation/qa_pairs_solutionops.json"
search_index_name = "fixed-size-chunks-180-30-engineering-mlops-ada"
embedding_function = oai_query_embedding
path_to_output = "../output/qa/results/fixed-size-chunks-180-30-engineering-mlops-ada.json"
generate_answers_for_qa(evaluation_data_path, search_index_name, embedding_function, path_to_output)
QA already created at: ../output/qa/results/fixed-size-chunks-180-30-engineering-mlops-ada.json
Generate answers using semantic chunking + an open-source embedding model
# # 1. Create the new index
# # TODO: Replace this with a name for your new index
# index_name = "semantic-chunking-eval"
# vector_size = 384
# create_index(index_name, vector_size)
# # 2. Upload the embeddings to the new index
# upload_data(file_path=pregenerated_semantic_chunks_embeddings_os,
# search_index_name=index_name)
import json
%run -i ./search.ipynb
evaluation_data_path = "../output/qa/evaluation/qa_pairs_solutionops.json"
search_index_name = "semantic-chunking-eval"
embedding_function = intfloat_e5_small_v2_query_embedding
path_to_output = "../output/qa/results/semantic-chunking-intfloat.json"
generate_answers_for_qa(evaluation_data_path, search_index_name, embedding_function, path_to_output)
QA already created at: ../output/qa/results/semantic-chunking-intfloat.json