headless_lms_chatbot/
content_cleaner.rs

1use crate::azure_chatbot::{LLMRequest, LLMRequestParams, NonThinkingParams};
2use crate::llm_utils::{APIMessage, MessageRole, estimate_tokens, make_blocking_llm_request};
3use crate::prelude::*;
4use headless_lms_utils::document_schema_processor::GutenbergBlock;
5use serde_json::Value;
6use tracing::{debug, error, info, instrument, warn};
7
/// Maximum context window size for LLM in tokens
pub const MAX_CONTEXT_WINDOW: i32 = 16000;
/// Maximum percentage of context window to use in a single request
/// (leaves headroom for the model's response and for token-estimate error)
pub const MAX_CONTEXT_UTILIZATION: f32 = 0.75;
/// Temperature for requests, low for deterministic results
pub const REQUEST_TEMPERATURE: f32 = 0.1;

/// JSON markers for LLM prompt; they wrap the JSON payload in the user
/// message so the model can tell the data apart from the instructions
const JSON_BEGIN_MARKER: &str = "---BEGIN COURSE MATERIAL JSON---";
const JSON_END_MARKER: &str = "---END COURSE MATERIAL JSON---";

/// System prompt for converting course material to markdown
const SYSTEM_PROMPT: &str = r#"You are given course material in an abstract JSON format from a headless CMS. Convert this into clean, semantic Markdown that includes all user-visible content to support full-text search.

* Extract and include all meaningful text content: paragraphs, headings, list items, image captions, and similar.
* Retain any inline formatting (like bold or italic text), converting HTML tags (`<strong>`, `<em>`, etc.) into equivalent Markdown formatting.
* For images, use the standard Markdown format: `![caption](url)`, including a caption if available.
* Preserve heading levels (e.g., level 2 → `##`, level 3 → `###`).
* Include text content from any block type, even non-standard ones, if it appears user-visible.
* For exercise blocks, include the exercise name, and assignment instructions. You may also include text from the exercise specification (public spec), if it can be formatted into markdown.
* If you encounter blocks that don't have any visible text in the JSON but are likely still user-visible (placeholder blocks) — e.g. `glossary`, `exercises-in-this-chapter`, `course-progress` — generate a fake heading representing the expected content (e.g. `## Glossary`).
* Do not generate headings for placeholder blocks that are not user-visible — e.g. `conditionally-visible-content`, `spacer`, `divider`.
* Exclude all purely stylistic attributes (e.g. colors, alignment, font sizes).
* Do not include any metadata, HTML tags (other than for formatting), or non-visible fields.
* Output **only the Markdown content**, and nothing else.
"#;

/// User prompt for converting course material to markdown; prepended to the
/// marker-wrapped JSON chunk in each request
const USER_PROMPT_START: &str =
    "Convert this JSON content to clean markdown. Output only the markdown, nothing else.";
39/// Cleans content by converting the material blocks to clean markdown using an LLM
40#[instrument(skip(blocks, app_config), fields(num_blocks = blocks.len()))]
41pub async fn convert_material_blocks_to_markdown_with_llm(
42    blocks: &[GutenbergBlock],
43    app_config: &ApplicationConfiguration,
44) -> anyhow::Result<String> {
45    debug!("Starting content conversion with {} blocks", blocks.len());
46    let system_message = APIMessage {
47        role: MessageRole::System,
48        content: SYSTEM_PROMPT.to_string(),
49    };
50
51    let system_message_tokens = estimate_tokens(&system_message.content);
52    let safe_token_limit = calculate_safe_token_limit(MAX_CONTEXT_WINDOW, MAX_CONTEXT_UTILIZATION);
53    let max_content_tokens = (safe_token_limit - system_message_tokens).max(1);
54
55    debug!(
56        "Token limits - system: {}, safe: {}, max content: {}",
57        system_message_tokens, safe_token_limit, max_content_tokens
58    );
59
60    let chunks = split_blocks_into_chunks(blocks, max_content_tokens)?;
61    debug!("Split content into {} chunks", chunks.len());
62    process_chunks(&chunks, &system_message, app_config).await
63}
64
/// Calculate the safe token limit based on context window and utilization.
///
/// Scales the window by the utilization fraction and truncates toward zero.
pub fn calculate_safe_token_limit(context_window: i32, utilization: f32) -> i32 {
    let scaled = context_window as f32 * utilization;
    scaled as i32
}
69
70/// Recursively removes all fields named "private_spec" from a JSON value
71fn remove_private_spec_recursive(value: &mut Value) {
72    match value {
73        Value::Object(map) => {
74            map.remove("private_spec");
75            for (_, v) in map.iter_mut() {
76                remove_private_spec_recursive(v);
77            }
78        }
79        Value::Array(arr) => {
80            for item in arr.iter_mut() {
81                remove_private_spec_recursive(item);
82            }
83        }
84        _ => {}
85    }
86}
87
88/// Converts a block to JSON string, removing any private_spec fields recursively
89fn block_to_json_string(block: &GutenbergBlock) -> anyhow::Result<String> {
90    let mut json_value = serde_json::to_value(block)?;
91    remove_private_spec_recursive(&mut json_value);
92    Ok(serde_json::to_string(&json_value)?)
93}
94
95/// Converts a vector of blocks to JSON string, removing any private_spec fields recursively
96fn blocks_to_json_string(blocks: &[GutenbergBlock]) -> anyhow::Result<String> {
97    let mut json_value = serde_json::to_value(blocks)?;
98    remove_private_spec_recursive(&mut json_value);
99    Ok(serde_json::to_string(&json_value)?)
100}
101
102/// Split blocks into chunks that fit within token limits
103#[instrument(skip(blocks), fields(max_content_tokens))]
104pub fn split_blocks_into_chunks(
105    blocks: &[GutenbergBlock],
106    max_content_tokens: i32,
107) -> anyhow::Result<Vec<String>> {
108    debug!("Starting to split {} blocks into chunks", blocks.len());
109    let mut chunks: Vec<String> = Vec::new();
110    let mut current_chunk: Vec<GutenbergBlock> = Vec::new();
111    let mut current_chunk_tokens = 0;
112
113    for block in blocks {
114        let block_json = block_to_json_string(block)?;
115        let block_tokens = estimate_tokens(&block_json);
116        debug!(
117            "Processing block {} with {} tokens",
118            block.client_id, block_tokens
119        );
120
121        // If this block alone exceeds the limit, split it into smaller chunks
122        if block_tokens > max_content_tokens {
123            warn!(
124                "Block {} exceeds max token limit ({} > {})",
125                block.client_id, block_tokens, max_content_tokens
126            );
127            // Add any accumulated blocks as a chunk
128            if !current_chunk.is_empty() {
129                chunks.push(blocks_to_json_string(&current_chunk)?);
130                current_chunk = Vec::new();
131                current_chunk_tokens = 0;
132            }
133
134            // Then we do some crude splitting for the oversized block
135            split_oversized_block(&block_json, max_content_tokens, &mut chunks)?;
136            continue;
137        }
138
139        if current_chunk_tokens + block_tokens > max_content_tokens {
140            debug!(
141                "Creating new chunk after {} blocks ({} tokens)",
142                current_chunk.len(),
143                current_chunk_tokens
144            );
145            chunks.push(blocks_to_json_string(&current_chunk)?);
146            current_chunk = Vec::new();
147            current_chunk_tokens = 0;
148        }
149
150        current_chunk.push(block.clone());
151        current_chunk_tokens += block_tokens;
152    }
153
154    if !current_chunk.is_empty() {
155        debug!(
156            "Adding final chunk with {} blocks ({} tokens)",
157            current_chunk.len(),
158            current_chunk_tokens
159        );
160        chunks.push(blocks_to_json_string(&current_chunk)?);
161    }
162
163    Ok(chunks)
164}
165
166/// Splits an oversized block into smaller string chunks
167#[instrument(skip(block_json, chunks), fields(max_tokens))]
168fn split_oversized_block(
169    block_json: &str,
170    max_tokens: i32,
171    chunks: &mut Vec<String>,
172) -> anyhow::Result<()> {
173    let total_tokens = estimate_tokens(block_json);
174    debug!(
175        "Splitting oversized block with {} tokens into chunks of max {} tokens",
176        total_tokens, max_tokens
177    );
178
179    // Make a very conservative estimate of the number of chunks we need
180    // Ensure max_tokens is at least 1 to avoid division by zero
181    let max_tokens_safe = max_tokens.max(1);
182    let num_chunks = (total_tokens as f32 / (max_tokens_safe as f32 * 0.5)).ceil() as usize;
183
184    if num_chunks <= 1 || num_chunks == 0 {
185        chunks.push(block_json.to_string());
186        return Ok(());
187    }
188
189    // Split by byte length (not character count) for efficiency,
190    // but ensure we only slice at UTF-8 character boundaries
191    let bytes_per_chunk = (block_json.len() / num_chunks).max(1);
192    debug!(
193        "Splitting into {} chunks of approximately {} bytes each",
194        num_chunks, bytes_per_chunk
195    );
196
197    let mut start = 0;
198    let mut iterations = 0;
199    const MAX_ITERATIONS: usize = 100;
200    while start < block_json.len() {
201        iterations += 1;
202        if iterations > MAX_ITERATIONS {
203            return Err(anyhow::anyhow!(
204                "Infinite loop protection: exceeded {} iterations in split_oversized_block",
205                MAX_ITERATIONS
206            ));
207        }
208
209        // Use checked arithmetic to prevent overflow
210        let end_candidate = start
211            .checked_add(bytes_per_chunk)
212            .unwrap_or(block_json.len())
213            .min(block_json.len());
214
215        let mut end = if end_candidate >= block_json.len() {
216            block_json.len()
217        } else {
218            end_candidate
219        };
220
221        // Adjust end backwards to the nearest UTF-8 character boundary
222        while !block_json.is_char_boundary(end) && end > start {
223            end -= 1;
224        }
225
226        // If backtracking resulted in end == start, advance forward to next boundary
227        if end == start {
228            // Find the next character boundary after start
229            let mut next_boundary = start
230                .checked_add(1)
231                .unwrap_or(block_json.len())
232                .min(block_json.len());
233
234            let mut boundary_iterations = 0;
235            const MAX_BOUNDARY_ITERATIONS: usize = 100;
236            while next_boundary < block_json.len() && !block_json.is_char_boundary(next_boundary) {
237                boundary_iterations += 1;
238                if boundary_iterations > MAX_BOUNDARY_ITERATIONS {
239                    return Err(anyhow::anyhow!(
240                        "Infinite loop protection: exceeded {} iterations finding character boundary",
241                        MAX_BOUNDARY_ITERATIONS
242                    ));
243                }
244                next_boundary = next_boundary
245                    .checked_add(1)
246                    .unwrap_or(block_json.len())
247                    .min(block_json.len());
248            }
249            end = next_boundary.min(block_json.len());
250        }
251
252        // Ensure we have a non-empty slice and valid bounds
253        if end > start && end <= block_json.len() && start < block_json.len() {
254            // Double-check bounds before slicing
255            let chunk = block_json.get(start..end).ok_or_else(|| {
256                anyhow::anyhow!("Invalid string slice bounds: {}..{}", start, end)
257            })?;
258            chunks.push(chunk.to_string());
259            let new_start = end;
260            // Safety check: ensure start always advances
261            if new_start <= start {
262                return Err(anyhow::anyhow!(
263                    "Infinite loop protection: start did not advance ({} -> {})",
264                    start,
265                    new_start
266                ));
267            }
268            start = new_start;
269        } else {
270            // Safety: if we can't make progress, break to avoid infinite loop
271            // Push remaining content if any
272            if start < block_json.len() {
273                if let Some(remaining) = block_json.get(start..) {
274                    if !remaining.is_empty() {
275                        chunks.push(remaining.to_string());
276                    }
277                }
278            }
279            break;
280        }
281    }
282
283    Ok(())
284}
285
/// Appends markdown content to a result string with proper newline separators.
///
/// Ensures exactly one blank line separates the existing content from the new
/// content; an empty `result` receives the new content with no separator.
pub fn append_markdown_with_separator(result: &mut String, new_content: &str) {
    // Number of newlines still needed so the pieces end up separated by "\n\n".
    let missing_newlines = if result.is_empty() || result.ends_with("\n\n") {
        0
    } else if result.ends_with('\n') {
        1
    } else {
        2
    };
    for _ in 0..missing_newlines {
        result.push('\n');
    }
    result.push_str(new_content);
}
298
299/// Process all chunks and combine the results
300#[instrument(skip(chunks, system_message, app_config), fields(num_chunks = chunks.len()))]
301async fn process_chunks(
302    chunks: &[String],
303    system_message: &APIMessage,
304    app_config: &ApplicationConfiguration,
305) -> anyhow::Result<String> {
306    debug!("Processing {} chunks", chunks.len());
307    let mut result = String::new();
308
309    for (i, chunk) in chunks.iter().enumerate() {
310        debug!("Processing chunk {}/{}", i + 1, chunks.len());
311        let chunk_markdown = process_block_chunk(chunk, system_message, app_config).await?;
312        append_markdown_with_separator(&mut result, &chunk_markdown);
313    }
314
315    info!("Successfully cleaned content with LLM");
316    Ok(result)
317}
318
319/// Process a subset of blocks in a single LLM request
320#[instrument(skip(chunk, system_message, app_config), fields(chunk_tokens = estimate_tokens(chunk)))]
321async fn process_block_chunk(
322    chunk: &str,
323    system_message: &APIMessage,
324    app_config: &ApplicationConfiguration,
325) -> anyhow::Result<String> {
326    let messages = prepare_llm_messages(chunk, system_message)?;
327    let llm_base_request: LLMRequest = LLMRequest {
328        messages,
329        data_sources: vec![],
330        params: LLMRequestParams::NonThinking(NonThinkingParams {
331            temperature: Some(REQUEST_TEMPERATURE),
332            top_p: None,
333            frequency_penalty: None,
334            presence_penalty: None,
335            max_tokens: None,
336        }),
337        stop: None,
338    };
339    info!(
340        "Processing chunk of approximately {} tokens",
341        estimate_tokens(chunk)
342    );
343
344    let completion = match make_blocking_llm_request(llm_base_request, app_config).await {
345        Ok(completion) => completion,
346        Err(e) => {
347            error!("Failed to process chunk: {}", e);
348            return Err(e);
349        }
350    };
351
352    let cleaned_content = completion
353        .choices
354        .first()
355        .ok_or_else(|| {
356            error!("No content returned from LLM");
357            anyhow::anyhow!("No content returned from LLM")
358        })?
359        .message
360        .content
361        .clone();
362
363    Ok(cleaned_content)
364}
365
366/// Prepare messages for the LLM request
367pub fn prepare_llm_messages(
368    chunk: &str,
369    system_message: &APIMessage,
370) -> anyhow::Result<Vec<APIMessage>> {
371    let messages = vec![
372        system_message.clone(),
373        APIMessage {
374            role: MessageRole::User,
375            content: format!(
376                "{}\n\n{}{}\n{}",
377                USER_PROMPT_START, JSON_BEGIN_MARKER, chunk, JSON_END_MARKER
378            ),
379        },
380    ];
381
382    Ok(messages)
383}
384
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    const TEST_BLOCK_NAME: &str = "test/block";

    /// Builds a minimal block whose only attribute is the given content string.
    fn create_test_block(content: &str) -> GutenbergBlock {
        let mut attributes = serde_json::Map::new();
        attributes.insert("content".to_string(), json!(content));
        GutenbergBlock {
            client_id: uuid::Uuid::new_v4(),
            name: TEST_BLOCK_NAME.to_string(),
            is_valid: true,
            attributes,
            inner_blocks: vec![],
        }
    }

    #[test]
    fn test_calculate_safe_token_limit() {
        assert_eq!(calculate_safe_token_limit(1000, 0.75), 750);
        assert_eq!(calculate_safe_token_limit(16000, 0.75), 12000);
        assert_eq!(calculate_safe_token_limit(8000, 0.5), 4000);
    }

    #[test]
    fn test_append_markdown_with_separator() {
        // Appending to an empty string adds no separator.
        let mut out = String::new();
        append_markdown_with_separator(&mut out, "New content");
        assert_eq!(out, "New content");

        // No trailing newline: a full blank-line separator is inserted.
        let mut out = String::from("Existing content");
        append_markdown_with_separator(&mut out, "New content");
        assert_eq!(out, "Existing content\n\nNew content");

        // One trailing newline: only one more newline is added.
        let mut out = String::from("Existing content\n");
        append_markdown_with_separator(&mut out, "New content");
        assert_eq!(out, "Existing content\n\nNew content");

        // Already separated: nothing extra is added.
        let mut out = String::from("Existing content\n\n");
        append_markdown_with_separator(&mut out, "New content");
        assert_eq!(out, "Existing content\n\nNew content");
    }

    #[test]
    fn test_split_blocks_into_chunks() -> anyhow::Result<()> {
        // Content strings of different lengths drive the token estimates.
        let block1 = create_test_block("a "); // short
        let block2 = create_test_block("b b b b b b b b b b b b b b b b b b b b "); // longer
        let block3 = create_test_block("c c c c c c c c c c c c c c c "); // medium

        let blocks = vec![block1.clone(), block2.clone(), block3.clone()];

        let t1 = estimate_tokens(&block_to_json_string(&block1)?);
        let t2 = estimate_tokens(&block_to_json_string(&block2)?);
        let t3 = estimate_tokens(&block_to_json_string(&block3)?);

        // A limit large enough for everything yields a single chunk.
        let chunks = split_blocks_into_chunks(&blocks, t1 + t2 + t3 + 10)?;
        assert_eq!(chunks.len(), 1);
        let all_blocks: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(all_blocks.len(), 3);

        // A limit just above the first block forces a split after it.
        let chunks = split_blocks_into_chunks(&blocks, t1 + 1)?;

        // The first chunk is a valid JSON array containing only block1.
        let first_chunk: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(first_chunk.len(), 1);
        assert_eq!(first_chunk[0].client_id, block1.client_id);

        // Later chunks may be raw fragments of oversized blocks (not valid
        // JSON), so only verify they are non-empty.
        assert!(chunks[1..].iter().all(|chunk| !chunk.is_empty()));

        Ok(())
    }

    #[test]
    fn test_prepare_llm_messages() -> anyhow::Result<()> {
        let blocks_json = blocks_to_json_string(&[create_test_block("Test content")])?;
        let system_message = APIMessage {
            role: MessageRole::System,
            content: "System prompt".to_string(),
        };

        let messages = prepare_llm_messages(&blocks_json, &system_message)?;

        assert_eq!(messages.len(), 2);
        assert_eq!(messages[0].role, MessageRole::System);
        assert_eq!(messages[0].content, "System prompt");
        assert_eq!(messages[1].role, MessageRole::User);
        assert!(messages[1].content.contains(JSON_BEGIN_MARKER));
        assert!(messages[1].content.contains("Test content"));

        Ok(())
    }
}