Skip to main content

headless_lms_chatbot/
citations.rs

1use std::path::PathBuf;
2
3use secrecy::SecretString;
4
5use crate::{llm_utils::build_llm_headers, prelude::*};
6
7use headless_lms_models::chatbot_conversation_messages_citations::{
8    self, ChatbotConversationMessageCitation,
9};
10use headless_lms_utils::strings::truncate_utf8_at_boundary;
11use headless_lms_utils::url_encoding::url_decode;
12use reqwest::Response;
13use serde::{Deserialize, Serialize};
14use tracing::{error, instrument, trace};
15use url::Url;
16
17#[derive(Serialize, Deserialize, Debug, Clone)]
18pub struct CourseMaterialDocument {
19    pub chunk_id: String,
20    pub chunk: String,
21    pub title: String,
22    pub url: String,
23    pub filepath: String,
24}
25
26impl CourseMaterialDocument {
27    /// Converts the document to citation. Returns also the page_id of the cited document
28    /// so we can get the correct chapter_number later.
29    pub fn to_chatbot_conversation_message_citation(
30        &self,
31        conversation_message_id: Uuid,
32        conversation_id: Uuid,
33        citation_number: i32,
34    ) -> ChatbotResult<(ChatbotConversationMessageCitation, Option<Uuid>)> {
35        // Shorten the content if needed
36        let content = if self.chunk.len() < 255 {
37            self.chunk.clone()
38        } else {
39            truncate_utf8_at_boundary(&self.chunk, 255).to_string()
40        };
41
42        // The title and URL come from Azure Blob Storage metadata, which was URL-encoded
43        // (percent-encoded) because Azure Blob Storage metadata values must be ASCII-only.
44        // We decode them back to their original UTF-8 strings before storing in the database.
45        let decoded_title = url_decode(&self.title)?;
46        let decoded_url = url_decode(&self.url)?;
47
48        // Get the page id
49        let mut page_path = PathBuf::from(&self.filepath);
50        page_path.set_extension("");
51        let page_id_str = page_path.file_name();
52        let page_id =
53            page_id_str.and_then(|id_str| Uuid::parse_str(id_str.to_string_lossy().as_ref()).ok());
54
55        Ok((
56            ChatbotConversationMessageCitation {
57                conversation_message_id,
58                conversation_id,
59                title: decoded_title,
60                content,
61                document_url: decoded_url,
62                citation_number,
63                ..Default::default()
64            },
65            page_id,
66        ))
67    }
68}
69
70/// Get documents cited by the chatbot from the search index and save them
71/// as chatbot_conversation_message_citations into the database
72pub async fn chatbot_cited_documents_to_citations(
73    conn: &mut PgConnection,
74    test_chatbot: bool,
75    mut document_urls: Vec<Url>,
76    api_key: &SecretString,
77    conversation_message_id: Uuid,
78    conversation_id: Uuid,
79) -> anyhow::Result<Vec<ChatbotConversationMessageCitation>> {
80    let mut documents: Vec<(CourseMaterialDocument, i32)> = vec![];
81    for (idx, url) in document_urls.iter_mut().enumerate() {
82        let document = get_course_material_document(url, api_key).await?;
83        let citation_number = (idx + 1) as i32;
84        documents.push((document, citation_number));
85    }
86    let res = save_documents(
87        conn,
88        test_chatbot,
89        documents,
90        conversation_message_id,
91        conversation_id,
92    )
93    .await?;
94
95    Ok(res)
96}
97
98/// Get a document from the search index with a LLM-provided get url
99async fn get_course_material_document(
100    endpoint: &mut Url,
101    api_key: &SecretString,
102) -> anyhow::Result<CourseMaterialDocument> {
103    endpoint.set_query(Some(
104        "api-version=2024-07-01&$select=chunk_id,parent_id,chunk,title,url,filepath,course_id",
105    ));
106    let headers = build_llm_headers(api_key)?;
107
108    let response = REQWEST_CLIENT
109        .get(endpoint.clone())
110        .headers(headers)
111        .send()
112        .await?;
113
114    process_course_material_document_response(response).await
115}
116
117#[instrument(skip(response), fields(status = %response.status()))]
118async fn process_course_material_document_response(
119    response: Response,
120) -> anyhow::Result<CourseMaterialDocument> {
121    if !response.status().is_success() {
122        let status = response.status();
123        let error_text = response.text().await?;
124        error!(
125            status = %status,
126            error = %error_text,
127            "Error fetching document from search index."
128        );
129        return Err(anyhow::anyhow!(
130            "Error fetching document from search index: Status: {}. Error: {}",
131            status,
132            error_text
133        ));
134    }
135
136    trace!("Processing successful LLM response");
137    // Parse the response
138    let document: CourseMaterialDocument = response.json().await?;
139
140    Ok(document)
141}
142
143/// Save a course material document into the database as a citation
144async fn save_documents(
145    conn: &mut PgConnection,
146    test_chatbot: bool,
147    documents_with_citation_numbers: Vec<(CourseMaterialDocument, i32)>,
148    conversation_message_id: Uuid,
149    conversation_id: Uuid,
150) -> anyhow::Result<Vec<ChatbotConversationMessageCitation>> {
151    let (citations, page_ids): (Vec<ChatbotConversationMessageCitation>, Vec<Option<Uuid>>) =
152        documents_with_citation_numbers
153            .iter()
154            .map(|(d, citation_number)| {
155                d.to_chatbot_conversation_message_citation(
156                    conversation_message_id,
157                    conversation_id,
158                    citation_number.to_owned(),
159                )
160            })
161            .collect::<ChatbotResult<Vec<(ChatbotConversationMessageCitation, Option<Uuid>)>>>()?
162            .into_iter()
163            .unzip();
164    if test_chatbot {
165        return save_documents_mock(conn, citations).await;
166    };
167    let res =
168        chatbot_conversation_messages_citations::insert_batch(conn, citations, page_ids).await?;
169
170    Ok(res)
171}
172
173async fn save_documents_mock(
174    conn: &mut PgConnection,
175    citations: Vec<ChatbotConversationMessageCitation>,
176) -> anyhow::Result<Vec<ChatbotConversationMessageCitation>> {
177    let mut res = vec![];
178    for input in citations {
179        let a = chatbot_conversation_messages_citations::insert(conn, input).await?;
180        res.push(a)
181    }
182    Ok(res)
183}