LLM-based Retrieval Augmented Generation#

If text embeddings perform poorly at identifying relevant documents, we can instead ask an LLM to identify them. To do this, we provide a list of files together with short summaries of their content and ask the LLM which documents are relevant to the question. We then take the content of the selected documents and assemble it into a long-context prompt.

from utilities import prompt_scadsai_llm, remove_outer_markdown, text_to_json
from IPython.display import display, Markdown
docs_root_folder = "hpc-compendium/doc.zih.tu-dresden.de/docs/"
compendium_url = "https://compendium.hpc.tu-dresden.de/"
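The helper prompt_scadsai_llm wraps an OpenAI-compatible client that talks to the ScaDS.AI LLM endpoint at https://llm.scads.ai/v1 and returns the model's answer as plain text. A minimal sketch of such a wrapper is shown below; the default model name and the wrapping of the prompt into a chat message are assumptions, not necessarily the exact implementation in utilities.py.

import os
import openai

def prompt_scadsai_llm(message, model="meta-llama/Llama-3.3-70B-Instruct"):
    """Send a single prompt to the ScaDS.AI LLM endpoint and return the answer text."""
    # set up a connection to the OpenAI-compatible LLM server
    client = openai.OpenAI(base_url="https://llm.scads.ai/v1",
                           api_key=os.environ.get('SCADSAI_API_KEY'))
    # wrap the plain-text prompt in a single user message and send it
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": message}]
    )
    # extract and return the answer text
    return response.choices[0].message.content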

This is again the question we aim to answer:

question = "How can I access the Jupyter Hub on the HPC system?"

Identifying relevant documents#

To identify relevant documents, we first load the summary list.

# Read the content of summaries.md 
with open('hpc_compendium_summaries.md', 'r', encoding='utf-8') as f:
    summaries = f.read()

# Print the first 700 characters to verify
print("First part of the content:")
print(summaries[:700], "...")
First part of the content:
* accessibility.md:
This document is an accessibility statement for the Technische Universität Dresden's websites, outlining the university's efforts to make its online presence barrier-free in accordance with German law, and providing contact information for reporting accessibility issues and seeking redress.

* data_protection_declaration.md:
This document outlines a data protection policy, stating that only IP addresses are collected for error analysis and not shared with third parties unless required by law, and users have the right to request information about their personal data and contact relevant authorities.

* index.md:
This documentation provides information on the High-Performan ...
response = prompt_scadsai_llm(f"""
Given a question and a list of document summaries, identify documents that might be helpful for answering the question.

## Question
{question} 

## Document summaries

{summaries}

## Your task:
Which of the documents above might be relevant for answering this question: {question}

Answer with a list of filenames in JSON format
""")

# post-processing of the result to get a proper list
json = remove_outer_markdown(response)
relevant_file_paths = text_to_json(json)
[print(f) for f in relevant_file_paths];
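The two helpers used for post-processing come from utilities.py: remove_outer_markdown strips a surrounding markdown code fence from the LLM response, and text_to_json parses the remaining text into a Python object. Their exact implementation is not shown here; conceptually they could look like the following sketch (the names with the _sketch suffix are hypothetical):

import json as json_lib

def remove_outer_markdown_sketch(text):
    """Strip a surrounding ``` code fence from an LLM response, if present."""
    text = text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        # drop the opening fence line (e.g. ```json) and the closing fence
        text = "\n".join(lines[1:]).rsplit("```", 1)[0]
    return text.strip()

def text_to_json_sketch(text):
    """Parse a JSON string, e.g. a list of filenames, into a Python list."""
    return json_lib.loads(text)

With the list of relevant files at hand, we read the full text of each selected document and assemble everything into a single context for the final prompt.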
full_texts = {}
for file in relevant_file_paths:
    with open(docs_root_folder + file, 'r', encoding='utf-8') as f:
        full_texts[compendium_url + file[:-3]] = f.read()


documents = "\n".join([f"### {file} \n\n```\n{content}\n```\n" for file, content in full_texts.items()])

documents[:500]
response = prompt_scadsai_llm(f"""
Given a question and a list of documents, answer the question using the information from these documents.

## Question
{question} 

## Documents

{documents}

## Your task:
Answer the question: {question}
If you used one of the documents above, cite it using a markdown-formatted link to the respective document. Keep the links exactly as given!
""")

display(Markdown(response))

Exercise#

Measure how long it takes to retrieve an answer using this approach, compared to long-context prompting.

Hint: Use the same LLM for both approaches. To fit the full text into a length-limited LLM for long-context prompting, you may have to shorten it.
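A minimal timing sketch, assuming long_context_prompt is a variable you build yourself containing the (possibly shortened) full compendium text:

import time

def timed(fn, *args):
    """Run fn(*args) once and return its result together with the elapsed time in seconds."""
    start = time.perf_counter()
    result = fn(*args)
    return result, time.perf_counter() - start

# hypothetical usage:
# answer, seconds = timed(prompt_scadsai_llm, long_context_prompt)
# print(f"Long-context prompting took {seconds:.1f} s")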