LLM-based Retrieval Augmented Generation#
If text embeddings perform poorly at identifying relevant documents, one can instead ask an LLM to identify them. To do so, we provide a list of files together with short summaries of their content and ask the LLM which documents are relevant to the question. We then take the content of the selected documents and assemble it into a long-context prompt.
from utilities import prompt_scadsai_llm, remove_outer_markdown, text_to_json
from IPython.display import display, Markdown
docs_root_folder = "hpc-compendium/doc.zih.tu-dresden.de/docs/"
compendium_url = "https://compendium.hpc.tu-dresden.de/"
This is again the question we aim to answer:
question = "How can I access the Jupyter Hub on the HPC system?"
Identifying relevant documents#
To identify relevant documents, we first load the summary list.
# Read the content of the summaries file
with open('hpc_compendium_summaries.md', 'r', encoding='utf-8') as f:
    summaries = f.read()

# Print the first 700 characters to verify
print("First part of the content:")
print(summaries[:700], "...")
First part of the content:
* accessibility.md:
This document is an accessibility statement for the Technische Universität Dresden's websites, outlining the university's efforts to make its online presence barrier-free in accordance with German law, and providing contact information for reporting accessibility issues and seeking redress.
* data_protection_declaration.md:
This document outlines a data protection policy, stating that only IP addresses are collected for error analysis and not shared with third parties unless required by law, and users have the right to request information about their personal data and contact relevant authorities.
* index.md:
This documentation provides information on the High-Performan ...
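Such a summary file can be produced in a preprocessing step by summarizing every documentation page once. A minimal sketch of such a script, assuming the prompt_scadsai_llm helper introduced above (the exact prompt wording and file layout are illustrative assumptions):

import os
from utilities import prompt_scadsai_llm

def summarize_docs(root_folder, output_file="hpc_compendium_summaries.md"):
    """Write one markdown bullet with an LLM-generated summary per document."""
    entries = []
    for dirpath, _, filenames in os.walk(root_folder):
        for name in sorted(filenames):
            if not name.endswith(".md"):
                continue
            path = os.path.join(dirpath, name)
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
            # Ask the LLM for a one-sentence summary of the page
            summary = prompt_scadsai_llm(
                f"Summarize this documentation page in one sentence:\n\n{text}")
            # Store the path relative to the docs root, so it can be reused below
            rel_path = os.path.relpath(path, root_folder)
            entries.append(f"* {rel_path}:\n{summary}\n")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(entries))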
response = prompt_scadsai_llm(f"""
Given a question and a list of document summaries, identify documents that might be helpful for answering the question.

## Question
{question}

## Document summaries

{summaries}

## Your task:
Which of the documents above might be relevant for answering this question: {question}

Answer with a list of filenames in JSON format
""")
# Post-process the result to obtain a proper list of filenames
json_text = remove_outer_markdown(response)
relevant_file_paths = text_to_json(json_text)

for f in relevant_file_paths:
    print(f)
---------------------------------------------------------------------------
InternalServerError                       Traceback (most recent call last)
Cell In[6], line 1
----> 1 response = prompt_scadsai_llm(f"""
...
InternalServerError: Error code: 500 - {'error': {'message': 'litellm.APIError: APIError: OpenAIException - Connection error.\nReceived Model Group=meta-llama/Llama-3.3-70B-Instruct\nAvailable Model Group Fallbacks=None\nError doing the fallback: list index out of range', 'type': None, 'param': None, 'code': '500'}}
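The error above is a transient server-side failure (an HTTP 500 from the LLM gateway), not a problem with the prompt; re-running the cell typically succeeds. A small retry wrapper can make such calls more robust, sketched here under the assumption that the openai client used by prompt_scadsai_llm raises openai.InternalServerError in this situation:

import time
import openai
from utilities import prompt_scadsai_llm

def prompt_with_retry(message, retries=3, delay=5):
    """Call the LLM, retrying on transient server-side errors."""
    for attempt in range(retries):
        try:
            return prompt_scadsai_llm(message)
        except openai.InternalServerError:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(delay)  # wait a moment before retrying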
full_texts = {}
for file in relevant_file_paths:
    with open(docs_root_folder + file, 'r', encoding='utf-8') as f:
        full_texts[compendium_url + file[:-3]] = f.read()

documents = "\n".join([f"### {file} \n\n```\n{content}\n```\n" for file, content in full_texts.items()])
documents[:500]
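Before sending the assembled documents to the LLM, it can be worth checking how large the prompt will be, since even long-context models have a finite context window. A rough rule of thumb is about four characters per token for English text (an approximation; exact counts depend on the model's tokenizer):

# Rough size check for the assembled prompt (~4 characters per token)
estimated_tokens = len(documents) // 4
print(f"{len(documents)} characters, roughly {estimated_tokens} tokens")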
response = prompt_scadsai_llm(f"""
Given a question and a list of documents, answer the question based on these documents.

## Question
{question}

## Documents

{documents}

## Your task:
Answer the question: {question}

In case you used one of the documents above, cite it using markdown-formatted links to the respective document. Keep the links untouched!
""")
display(Markdown(response))
Exercise#
Measure how long it takes to retrieve an answer using this approach, compared to long-context prompting.
Hint: Use the same LLM for both approaches. To do this with a length-limited LLM, you may have to shorten the full text.
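A minimal timing sketch for this exercise, using Python's time module. The variable all_documents is a hypothetical placeholder for the full (possibly shortened) text of every document; the prompts are illustrative, not the exact ones used above:

import time

# Time the two-step approach: document selection, then answering from the selection
start = time.perf_counter()
selection = prompt_scadsai_llm(
    f"Which documents are relevant to: {question}\n\n{summaries}")
# ... load the selected documents into `documents` as shown above ...
answer = prompt_scadsai_llm(
    f"Answer this question: {question}\n\nDocuments:\n{documents}")
print(f"LLM-based retrieval: {time.perf_counter() - start:.1f} s")

# Time long-context prompting: all documents in a single prompt
start = time.perf_counter()
answer = prompt_scadsai_llm(
    f"Answer this question: {question}\n\nDocuments:\n{all_documents}")
print(f"Long-context prompting: {time.perf_counter() - start:.1f} s")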