//! Converts Gutenberg course-material blocks into clean Markdown via an LLM.
//! Path: headless_lms_chatbot/content_cleaner.rs

1use crate::llm_utils::{Message, MessageRole, estimate_tokens, make_blocking_llm_request};
2use crate::prelude::*;
3use headless_lms_utils::document_schema_processor::GutenbergBlock;
4use tracing::{debug, error, info, instrument, warn};
5
/// Maximum context window size for LLM in tokens.
pub const MAX_CONTEXT_WINDOW: i32 = 16000;
/// Maximum fraction of the context window to use in a single request,
/// leaving headroom for the model's own completion.
pub const MAX_CONTEXT_UTILIZATION: f32 = 0.75;
/// Temperature for requests, kept low for deterministic results.
pub const REQUEST_TEMPERATURE: f32 = 0.1;

/// Marker placed immediately before the JSON payload in the user prompt.
const JSON_BEGIN_MARKER: &str = "---BEGIN COURSE MATERIAL JSON---";
/// Marker placed immediately after the JSON payload in the user prompt.
const JSON_END_MARKER: &str = "---END COURSE MATERIAL JSON---";

/// System prompt instructing the model how to convert course material JSON to Markdown.
const SYSTEM_PROMPT: &str = r#"You are given course material in an abstract JSON format from a headless CMS. Convert this into clean, semantic Markdown that includes all user-visible content to support full-text search.

* Extract and include all meaningful text content: paragraphs, headings, list items, image captions, and similar.
* Retain any inline formatting (like bold or italic text), converting HTML tags (`<strong>`, `<em>`, etc.) into equivalent Markdown formatting.
* For images, use the standard Markdown format: `![caption](url)`, including a caption if available.
* Preserve heading levels (e.g., level 2 → `##`, level 3 → `###`).
* Include text content from any block type, even non-standard ones, if it appears user-visible.
* For exercise blocks, include the exercise name, and assignment instructions. You may also include text from the exercise specification (public spec), if it can be formatted into markdown.
* If you encounter blocks that don't have any visible text in the JSON but are likely still user-visible (placeholder blocks) — e.g. `glossary`, `exercises-in-this-chapter`, `course-progress` — generate a fake heading representing the expected content (e.g. `## Glossary`).
* Do not generate headings for placeholder blocks that are not user-visible — e.g. `conditionally-visible-content`, `spacer`, `divider`.
* Exclude all purely stylistic attributes (e.g. colors, alignment, font sizes).
* Do not include any metadata, HTML tags (other than for formatting), or non-visible fields.
* Output **only the Markdown content**, and nothing else.
"#;

/// Opening line of the user prompt that precedes the JSON payload.
const USER_PROMPT_START: &str =
    "Convert this JSON content to clean markdown. Output only the markdown, nothing else.";
36
37/// Cleans content by converting the material blocks to clean markdown using an LLM
38#[instrument(skip(blocks, app_config), fields(num_blocks = blocks.len()))]
39pub async fn convert_material_blocks_to_markdown_with_llm(
40    blocks: &[GutenbergBlock],
41    app_config: &ApplicationConfiguration,
42) -> anyhow::Result<String> {
43    debug!("Starting content conversion with {} blocks", blocks.len());
44    let system_message = Message {
45        role: MessageRole::System,
46        content: SYSTEM_PROMPT.to_string(),
47    };
48
49    let system_message_tokens = estimate_tokens(&system_message.content);
50    let safe_token_limit = calculate_safe_token_limit(MAX_CONTEXT_WINDOW, MAX_CONTEXT_UTILIZATION);
51    let max_content_tokens = safe_token_limit - system_message_tokens;
52
53    debug!(
54        "Token limits - system: {}, safe: {}, max content: {}",
55        system_message_tokens, safe_token_limit, max_content_tokens
56    );
57
58    let chunks = split_blocks_into_chunks(blocks, max_content_tokens)?;
59    debug!("Split content into {} chunks", chunks.len());
60    process_chunks(&chunks, &system_message, app_config).await
61}
62
/// Calculate the safe token limit based on context window and utilization.
///
/// Multiplies the window size by the utilization fraction and truncates
/// toward zero.
pub fn calculate_safe_token_limit(context_window: i32, utilization: f32) -> i32 {
    let scaled = utilization * context_window as f32;
    scaled as i32
}
67
68/// Split blocks into chunks that fit within token limits
69#[instrument(skip(blocks), fields(max_content_tokens))]
70pub fn split_blocks_into_chunks(
71    blocks: &[GutenbergBlock],
72    max_content_tokens: i32,
73) -> anyhow::Result<Vec<String>> {
74    debug!("Starting to split {} blocks into chunks", blocks.len());
75    let mut chunks: Vec<String> = Vec::new();
76    let mut current_chunk: Vec<GutenbergBlock> = Vec::new();
77    let mut current_chunk_tokens = 0;
78
79    for block in blocks {
80        let block_json = serde_json::to_string(block)?;
81        let block_tokens = estimate_tokens(&block_json);
82        debug!(
83            "Processing block {} with {} tokens",
84            block.client_id, block_tokens
85        );
86
87        // If this block alone exceeds the limit, split it into smaller chunks
88        if block_tokens > max_content_tokens {
89            warn!(
90                "Block {} exceeds max token limit ({} > {})",
91                block.client_id, block_tokens, max_content_tokens
92            );
93            // Add any accumulated blocks as a chunk
94            if !current_chunk.is_empty() {
95                chunks.push(serde_json::to_string(&current_chunk)?);
96                current_chunk = Vec::new();
97                current_chunk_tokens = 0;
98            }
99
100            // Then we do some crude splitting for the oversized block
101            split_oversized_block(&block_json, max_content_tokens, &mut chunks)?;
102            continue;
103        }
104
105        if current_chunk_tokens + block_tokens > max_content_tokens {
106            debug!(
107                "Creating new chunk after {} blocks ({} tokens)",
108                current_chunk.len(),
109                current_chunk_tokens
110            );
111            chunks.push(serde_json::to_string(&current_chunk)?);
112            current_chunk = Vec::new();
113            current_chunk_tokens = 0;
114        }
115
116        current_chunk.push(block.clone());
117        current_chunk_tokens += block_tokens;
118    }
119
120    if !current_chunk.is_empty() {
121        debug!(
122            "Adding final chunk with {} blocks ({} tokens)",
123            current_chunk.len(),
124            current_chunk_tokens
125        );
126        chunks.push(serde_json::to_string(&current_chunk)?);
127    }
128
129    Ok(chunks)
130}
131
132/// Splits an oversized block into smaller string chunks
133#[instrument(skip(block_json, chunks), fields(max_tokens))]
134fn split_oversized_block(
135    block_json: &str,
136    max_tokens: i32,
137    chunks: &mut Vec<String>,
138) -> anyhow::Result<()> {
139    let total_tokens = estimate_tokens(block_json);
140    debug!(
141        "Splitting oversized block with {} tokens into chunks of max {} tokens",
142        total_tokens, max_tokens
143    );
144
145    // Make a very conservative estimate of the number of chunks we need
146    let num_chunks = (total_tokens as f32 / (max_tokens as f32 * 0.5)).ceil() as usize;
147
148    if num_chunks <= 1 {
149        chunks.push(block_json.to_string());
150        return Ok(());
151    }
152
153    let chars_per_chunk = block_json.len() / num_chunks;
154    debug!(
155        "Splitting into {} chunks of approximately {} chars each",
156        num_chunks, chars_per_chunk
157    );
158
159    let mut start = 0;
160    while start < block_json.len() {
161        let end = if start + chars_per_chunk >= block_json.len() {
162            block_json.len()
163        } else {
164            start + chars_per_chunk
165        };
166
167        let chunk = &block_json[start..end];
168        chunks.push(chunk.to_string());
169
170        start = end;
171    }
172
173    Ok(())
174}
175
176/// Appends markdown content to a result string with proper newline separators
177pub fn append_markdown_with_separator(result: &mut String, new_content: &str) {
178    if !result.is_empty() && !result.ends_with("\n\n") {
179        if result.ends_with('\n') {
180            result.push('\n');
181        } else {
182            result.push_str("\n\n");
183        }
184    }
185
186    result.push_str(new_content);
187}
188
189/// Process all chunks and combine the results
190#[instrument(skip(chunks, system_message, app_config), fields(num_chunks = chunks.len()))]
191async fn process_chunks(
192    chunks: &[String],
193    system_message: &Message,
194    app_config: &ApplicationConfiguration,
195) -> anyhow::Result<String> {
196    debug!("Processing {} chunks", chunks.len());
197    let mut result = String::new();
198
199    for (i, chunk) in chunks.iter().enumerate() {
200        debug!("Processing chunk {}/{}", i + 1, chunks.len());
201        let chunk_markdown = process_block_chunk(chunk, system_message, app_config).await?;
202        append_markdown_with_separator(&mut result, &chunk_markdown);
203    }
204
205    info!("Successfully cleaned content with LLM");
206    Ok(result)
207}
208
209/// Process a subset of blocks in a single LLM request
210#[instrument(skip(chunk, system_message, app_config), fields(chunk_tokens = estimate_tokens(chunk)))]
211async fn process_block_chunk(
212    chunk: &str,
213    system_message: &Message,
214    app_config: &ApplicationConfiguration,
215) -> anyhow::Result<String> {
216    let messages = prepare_llm_messages(chunk, system_message)?;
217
218    info!(
219        "Processing chunk of approximately {} tokens",
220        estimate_tokens(chunk)
221    );
222
223    let completion =
224        match make_blocking_llm_request(messages, REQUEST_TEMPERATURE, None, app_config).await {
225            Ok(completion) => completion,
226            Err(e) => {
227                error!("Failed to process chunk: {}", e);
228                return Err(e);
229            }
230        };
231
232    let cleaned_content = completion
233        .choices
234        .first()
235        .ok_or_else(|| {
236            error!("No content returned from LLM");
237            anyhow::anyhow!("No content returned from LLM")
238        })?
239        .message
240        .content
241        .clone();
242
243    Ok(cleaned_content)
244}
245
246/// Prepare messages for the LLM request
247pub fn prepare_llm_messages(chunk: &str, system_message: &Message) -> anyhow::Result<Vec<Message>> {
248    let messages = vec![
249        system_message.clone(),
250        Message {
251            role: MessageRole::User,
252            content: format!(
253                "{}\n\n{}{}\n{}",
254                USER_PROMPT_START, JSON_BEGIN_MARKER, chunk, JSON_END_MARKER
255            ),
256        },
257    ];
258
259    Ok(messages)
260}
261
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    const TEST_BLOCK_NAME: &str = "test/block";

    /// Builds a minimal block whose `content` attribute drives token estimation.
    fn create_test_block(content: &str) -> GutenbergBlock {
        let mut attributes = serde_json::Map::new();
        attributes.insert("content".to_string(), json!(content));
        GutenbergBlock {
            client_id: uuid::Uuid::new_v4(),
            name: TEST_BLOCK_NAME.to_string(),
            is_valid: true,
            attributes,
            inner_blocks: vec![],
        }
    }

    #[test]
    fn test_calculate_safe_token_limit() {
        assert_eq!(calculate_safe_token_limit(1000, 0.75), 750);
        assert_eq!(calculate_safe_token_limit(16000, 0.75), 12000);
        assert_eq!(calculate_safe_token_limit(8000, 0.5), 4000);
    }

    #[test]
    fn test_append_markdown_with_separator() {
        // Empty target: content appended with no separator.
        let mut acc = String::new();
        append_markdown_with_separator(&mut acc, "New content");
        assert_eq!(acc, "New content");

        // No trailing newline: a full blank line is inserted.
        let mut acc = String::from("Existing content");
        append_markdown_with_separator(&mut acc, "New content");
        assert_eq!(acc, "Existing content\n\nNew content");

        // Single trailing newline: topped up to a blank line.
        let mut acc = String::from("Existing content\n");
        append_markdown_with_separator(&mut acc, "New content");
        assert_eq!(acc, "Existing content\n\nNew content");

        // Already separated: nothing extra inserted.
        let mut acc = String::from("Existing content\n\n");
        append_markdown_with_separator(&mut acc, "New content");
        assert_eq!(acc, "Existing content\n\nNew content");
    }

    #[test]
    fn test_split_blocks_into_chunks() -> anyhow::Result<()> {
        // Content strings of different lengths influence token estimation.
        let short_block = create_test_block("a "); // short
        let long_block = create_test_block("b b b b b b b b b b b b b b b b b b b b "); // longer
        let medium_block = create_test_block("c c c c c c c c c c c c c c c "); // medium

        let blocks = vec![short_block.clone(), long_block.clone(), medium_block.clone()];

        // Token cost of each block's serialization.
        let t1 = estimate_tokens(&serde_json::to_string(&short_block)?);
        let t2 = estimate_tokens(&serde_json::to_string(&long_block)?);
        let t3 = estimate_tokens(&serde_json::to_string(&medium_block)?);

        // A limit that fits everything yields a single chunk of all three blocks.
        let chunks = split_blocks_into_chunks(&blocks, t1 + t2 + t3 + 10)?;
        assert_eq!(chunks.len(), 1);
        let combined: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(combined.len(), 3);

        // A limit that only fits the first block forces a split after it.
        let chunks = split_blocks_into_chunks(&blocks, t1 + 1)?;

        // First chunk is a valid JSON array holding exactly the first block.
        let first_chunk: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(first_chunk.len(), 1);
        assert_eq!(first_chunk[0].client_id, short_block.client_id);

        // Remaining chunks may be raw JSON fragments from the oversized-block
        // path, so only verify they are non-empty.
        for fragment in &chunks[1..] {
            assert!(!fragment.is_empty());
        }

        Ok(())
    }

    #[test]
    fn test_prepare_llm_messages() -> anyhow::Result<()> {
        let blocks_json = serde_json::to_string(&vec![create_test_block("Test content")])?;
        let system_message = Message {
            role: MessageRole::System,
            content: "System prompt".to_string(),
        };

        let messages = prepare_llm_messages(&blocks_json, &system_message)?;

        assert_eq!(messages.len(), 2);
        assert_eq!(messages[0].role, MessageRole::System);
        assert_eq!(messages[0].content, "System prompt");
        assert_eq!(messages[1].role, MessageRole::User);
        assert!(messages[1].content.contains(JSON_BEGIN_MARKER));
        assert!(messages[1].content.contains("Test content"));

        Ok(())
    }
}