headless_lms_chatbot/
content_cleaner.rs

1use crate::azure_chatbot::{LLMRequest, LLMRequestParams, NonThinkingParams, ThinkingParams};
2use crate::llm_utils::{
3    APIMessage, APIMessageKind, APIMessageText, estimate_tokens, make_blocking_llm_request,
4    parse_text_completion,
5};
6use crate::prelude::*;
7use headless_lms_models::application_task_default_language_models::TaskLMSpec;
8use headless_lms_models::chatbot_conversation_messages::MessageRole;
9use headless_lms_utils::document_schema_processor::GutenbergBlock;
10use serde_json::Value;
11use tracing::{debug, error, info, instrument, warn};
12
/// Temperature for requests, low for deterministic results
pub const REQUEST_TEMPERATURE: f32 = 0.1;

/// JSON markers for LLM prompt; they delimit the course material payload in
/// the user message so the model can tell the data apart from the instructions
const JSON_BEGIN_MARKER: &str = "---BEGIN COURSE MATERIAL JSON---";
const JSON_END_MARKER: &str = "---END COURSE MATERIAL JSON---";

/// System prompt for converting course material to markdown
const SYSTEM_PROMPT: &str = r#"You are given course material in an abstract JSON format from a headless CMS. Convert this into clean, semantic Markdown that includes all user-visible content to support full-text search.

* Extract and include all meaningful text content: paragraphs, headings, list items, image captions, and similar.
* Retain any inline formatting (like bold or italic text), converting HTML tags (`<strong>`, `<em>`, etc.) into equivalent Markdown formatting.
* For images, use the standard Markdown format: `![caption](url)`, including a caption if available.
* Preserve heading levels (e.g., level 2 → `##`, level 3 → `###`).
* Include text content from any block type, even non-standard ones, if it appears user-visible.
* For exercise blocks, include the exercise name, and assignment instructions. You may also include text from the exercise specification (public spec), if it can be formatted into markdown.
* If you encounter blocks that don't have any visible text in the JSON but are likely still user-visible (placeholder blocks) — e.g. `glossary`, `exercises-in-this-chapter`, `course-progress` — generate a fake heading representing the expected content (e.g. `## Glossary`).
* Do not generate headings for placeholder blocks that are not user-visible — e.g. `conditionally-visible-content`, `spacer`, `divider`.
* Exclude all purely stylistic attributes (e.g. colors, alignment, font sizes).
* Do not include any metadata, HTML tags (other than for formatting), or non-visible fields.
* Output **only the Markdown content**, and nothing else.
"#;

/// User prompt for converting course material to markdown; prepended before
/// the marker-delimited JSON payload in each request
const USER_PROMPT_START: &str =
    "Convert this JSON content to clean markdown. Output only the markdown, nothing else.";
39
40/// Cleans content by converting the material blocks to clean markdown using an LLM
41#[instrument(skip(blocks, app_config, task_lm), fields(num_blocks = blocks.len()))]
42pub async fn convert_material_blocks_to_markdown_with_llm(
43    blocks: &[GutenbergBlock],
44    app_config: &ApplicationConfiguration,
45    task_lm: &TaskLMSpec,
46) -> anyhow::Result<String> {
47    debug!("Starting content conversion with {} blocks", blocks.len());
48    let system_message = APIMessage {
49        role: MessageRole::System,
50        fields: APIMessageKind::Text(APIMessageText {
51            content: SYSTEM_PROMPT.to_string(),
52        }),
53    };
54
55    let system_message_tokens = estimate_tokens(SYSTEM_PROMPT);
56    let safe_token_limit =
57        calculate_safe_token_limit(task_lm.context_size, task_lm.context_utilization);
58    let max_content_tokens = (safe_token_limit - system_message_tokens).max(1);
59
60    debug!(
61        "Token limits - system: {}, safe: {}, max content: {}",
62        system_message_tokens, safe_token_limit, max_content_tokens
63    );
64
65    let chunks = split_blocks_into_chunks(blocks, max_content_tokens)?;
66    debug!("Split content into {} chunks", chunks.len());
67    process_chunks(&chunks, &system_message, app_config, task_lm).await
68}
69
/// Calculate the safe token limit based on context window and utilization
///
/// Scales the context window by the utilization factor and truncates toward
/// zero, matching how the token budget is consumed downstream.
pub fn calculate_safe_token_limit(context_window: i32, utilization: f32) -> i32 {
    let scaled = context_window as f32 * utilization;
    scaled as i32
}
74
75/// Recursively removes all fields named "private_spec" from a JSON value
76fn remove_private_spec_recursive(value: &mut Value) {
77    match value {
78        Value::Object(map) => {
79            map.remove("private_spec");
80            for (_, v) in map.iter_mut() {
81                remove_private_spec_recursive(v);
82            }
83        }
84        Value::Array(arr) => {
85            for item in arr.iter_mut() {
86                remove_private_spec_recursive(item);
87            }
88        }
89        _ => {}
90    }
91}
92
93/// Converts a block to JSON string, removing any private_spec fields recursively
94fn block_to_json_string(block: &GutenbergBlock) -> anyhow::Result<String> {
95    let mut json_value = serde_json::to_value(block)?;
96    remove_private_spec_recursive(&mut json_value);
97    Ok(serde_json::to_string(&json_value)?)
98}
99
100/// Converts a vector of blocks to JSON string, removing any private_spec fields recursively
101fn blocks_to_json_string(blocks: &[GutenbergBlock]) -> anyhow::Result<String> {
102    let mut json_value = serde_json::to_value(blocks)?;
103    remove_private_spec_recursive(&mut json_value);
104    Ok(serde_json::to_string(&json_value)?)
105}
106
107/// Split blocks into chunks that fit within token limits
108#[instrument(skip(blocks), fields(max_content_tokens))]
109pub fn split_blocks_into_chunks(
110    blocks: &[GutenbergBlock],
111    max_content_tokens: i32,
112) -> anyhow::Result<Vec<String>> {
113    debug!("Starting to split {} blocks into chunks", blocks.len());
114    let mut chunks: Vec<String> = Vec::new();
115    let mut current_chunk: Vec<GutenbergBlock> = Vec::new();
116    let mut current_chunk_tokens = 0;
117
118    for block in blocks {
119        let block_json = block_to_json_string(block)?;
120        let block_tokens = estimate_tokens(&block_json);
121        debug!(
122            "Processing block {} with {} tokens",
123            block.client_id, block_tokens
124        );
125
126        // If this block alone exceeds the limit, split it into smaller chunks
127        if block_tokens > max_content_tokens {
128            warn!(
129                "Block {} exceeds max token limit ({} > {})",
130                block.client_id, block_tokens, max_content_tokens
131            );
132            // Add any accumulated blocks as a chunk
133            if !current_chunk.is_empty() {
134                chunks.push(blocks_to_json_string(&current_chunk)?);
135                current_chunk = Vec::new();
136                current_chunk_tokens = 0;
137            }
138
139            // Then we do some crude splitting for the oversized block
140            split_oversized_block(&block_json, max_content_tokens, &mut chunks)?;
141            continue;
142        }
143
144        if current_chunk_tokens + block_tokens > max_content_tokens {
145            debug!(
146                "Creating new chunk after {} blocks ({} tokens)",
147                current_chunk.len(),
148                current_chunk_tokens
149            );
150            chunks.push(blocks_to_json_string(&current_chunk)?);
151            current_chunk = Vec::new();
152            current_chunk_tokens = 0;
153        }
154
155        current_chunk.push(block.clone());
156        current_chunk_tokens += block_tokens;
157    }
158
159    if !current_chunk.is_empty() {
160        debug!(
161            "Adding final chunk with {} blocks ({} tokens)",
162            current_chunk.len(),
163            current_chunk_tokens
164        );
165        chunks.push(blocks_to_json_string(&current_chunk)?);
166    }
167
168    Ok(chunks)
169}
170
/// Splits an oversized block into smaller string chunks
///
/// The block's JSON string is cut into roughly even byte ranges, each cut
/// adjusted to a UTF-8 character boundary so slicing never panics. The
/// resulting chunks are raw string fragments (not valid JSON on their own)
/// and are appended to `chunks`.
///
/// # Errors
///
/// Returns an error if the loop fails to make forward progress (guarded by
/// iteration limits) or if computed slice bounds are invalid.
///
/// NOTE(review): a block that needs more than `MAX_ITERATIONS` chunks will
/// error out rather than be split — confirm this cap is acceptable for the
/// largest expected blocks.
#[instrument(skip(block_json, chunks), fields(max_tokens))]
fn split_oversized_block(
    block_json: &str,
    max_tokens: i32,
    chunks: &mut Vec<String>,
) -> anyhow::Result<()> {
    let total_tokens = estimate_tokens(block_json);
    debug!(
        "Splitting oversized block with {} tokens into chunks of max {} tokens",
        total_tokens, max_tokens
    );

    // Make a very conservative estimate of the number of chunks we need
    // Ensure max_tokens is at least 1 to avoid division by zero
    let max_tokens_safe = max_tokens.max(1);
    // The 0.5 factor halves the per-chunk budget, biasing toward more,
    // smaller chunks so each fragment stays comfortably under the limit.
    let num_chunks = (total_tokens as f32 / (max_tokens_safe as f32 * 0.5)).ceil() as usize;

    // Small enough to ship as a single chunk without splitting.
    // NOTE(review): `num_chunks == 0` is already implied by `num_chunks <= 1`.
    if num_chunks <= 1 || num_chunks == 0 {
        chunks.push(block_json.to_string());
        return Ok(());
    }

    // Split by byte length (not character count) for efficiency,
    // but ensure we only slice at UTF-8 character boundaries
    let bytes_per_chunk = (block_json.len() / num_chunks).max(1);
    debug!(
        "Splitting into {} chunks of approximately {} bytes each",
        num_chunks, bytes_per_chunk
    );

    let mut start = 0;
    let mut iterations = 0;
    const MAX_ITERATIONS: usize = 100;
    while start < block_json.len() {
        iterations += 1;
        if iterations > MAX_ITERATIONS {
            return Err(anyhow::anyhow!(
                "Infinite loop protection: exceeded {} iterations in split_oversized_block",
                MAX_ITERATIONS
            ));
        }

        // Use checked arithmetic to prevent overflow
        let end_candidate = start
            .checked_add(bytes_per_chunk)
            .unwrap_or(block_json.len())
            .min(block_json.len());

        let mut end = if end_candidate >= block_json.len() {
            block_json.len()
        } else {
            end_candidate
        };

        // Adjust end backwards to the nearest UTF-8 character boundary
        while !block_json.is_char_boundary(end) && end > start {
            end -= 1;
        }

        // If backtracking resulted in end == start, advance forward to next boundary
        if end == start {
            // Find the next character boundary after start
            let mut next_boundary = start
                .checked_add(1)
                .unwrap_or(block_json.len())
                .min(block_json.len());

            // A UTF-8 code point is at most 4 bytes, so this scan is short;
            // the cap is pure belt-and-braces.
            let mut boundary_iterations = 0;
            const MAX_BOUNDARY_ITERATIONS: usize = 100;
            while next_boundary < block_json.len() && !block_json.is_char_boundary(next_boundary) {
                boundary_iterations += 1;
                if boundary_iterations > MAX_BOUNDARY_ITERATIONS {
                    return Err(anyhow::anyhow!(
                        "Infinite loop protection: exceeded {} iterations finding character boundary",
                        MAX_BOUNDARY_ITERATIONS
                    ));
                }
                next_boundary = next_boundary
                    .checked_add(1)
                    .unwrap_or(block_json.len())
                    .min(block_json.len());
            }
            end = next_boundary.min(block_json.len());
        }

        // Ensure we have a non-empty slice and valid bounds
        if end > start && end <= block_json.len() && start < block_json.len() {
            // Double-check bounds before slicing
            let chunk = block_json.get(start..end).ok_or_else(|| {
                anyhow::anyhow!("Invalid string slice bounds: {}..{}", start, end)
            })?;
            chunks.push(chunk.to_string());
            let new_start = end;
            // Safety check: ensure start always advances
            if new_start <= start {
                return Err(anyhow::anyhow!(
                    "Infinite loop protection: start did not advance ({} -> {})",
                    start,
                    new_start
                ));
            }
            start = new_start;
        } else {
            // Safety: if we can't make progress, break to avoid infinite loop
            // Push remaining content if any
            if start < block_json.len()
                && let Some(remaining) = block_json.get(start..)
                && !remaining.is_empty()
            {
                chunks.push(remaining.to_string());
            }
            break;
        }
    }

    Ok(())
}
289
/// Appends markdown content to a result string with proper newline separators
///
/// Guarantees exactly one blank line between previously appended content and
/// `new_content`, regardless of how the existing text currently ends.
pub fn append_markdown_with_separator(result: &mut String, new_content: &str) {
    let needs_separator = !result.is_empty() && !result.ends_with("\n\n");
    if needs_separator {
        // Top up to a full blank line: one newline if one is already there,
        // otherwise both.
        let missing = if result.ends_with('\n') { "\n" } else { "\n\n" };
        result.push_str(missing);
    }
    result.push_str(new_content);
}
302
303/// Process all chunks and combine the results
304#[instrument(skip(chunks, system_message, app_config, task_lm), fields(num_chunks = chunks.len()))]
305async fn process_chunks(
306    chunks: &[String],
307    system_message: &APIMessage,
308    app_config: &ApplicationConfiguration,
309    task_lm: &TaskLMSpec,
310) -> anyhow::Result<String> {
311    debug!("Processing {} chunks", chunks.len());
312    let mut result = String::new();
313
314    for (i, chunk) in chunks.iter().enumerate() {
315        debug!("Processing chunk {}/{}", i + 1, chunks.len());
316        let chunk_markdown =
317            process_block_chunk(chunk, system_message, app_config, task_lm).await?;
318        append_markdown_with_separator(&mut result, &chunk_markdown);
319    }
320
321    info!("Successfully cleaned content with LLM");
322    Ok(result)
323}
324
325/// Process a subset of blocks in a single LLM request
326#[instrument(skip(chunk, system_message, app_config, task_lm), fields(chunk_tokens = estimate_tokens(chunk)))]
327async fn process_block_chunk(
328    chunk: &str,
329    system_message: &APIMessage,
330    app_config: &ApplicationConfiguration,
331    task_lm: &TaskLMSpec,
332) -> ChatbotResult<String> {
333    let messages = prepare_llm_messages(chunk, system_message)?;
334    let params = if task_lm.thinking {
335        LLMRequestParams::Thinking(ThinkingParams {
336            max_completion_tokens: None,
337            verbosity: None,
338            reasoning_effort: None,
339            tools: vec![],
340            tool_choice: None,
341        })
342    } else {
343        LLMRequestParams::NonThinking(NonThinkingParams {
344            temperature: Some(REQUEST_TEMPERATURE),
345            top_p: None,
346            frequency_penalty: None,
347            presence_penalty: None,
348            max_tokens: None,
349        })
350    };
351    let llm_base_request: LLMRequest = LLMRequest {
352        messages,
353        data_sources: vec![],
354        params,
355        response_format: None,
356        stop: None,
357    };
358    info!(
359        "Processing chunk of approximately {} tokens",
360        estimate_tokens(chunk)
361    );
362
363    let completion = match make_blocking_llm_request(llm_base_request, app_config, task_lm).await {
364        Ok(completion) => completion,
365        Err(e) => {
366            error!("Failed to process chunk: {}", e);
367            return Err(ChatbotError::from(e));
368        }
369    };
370
371    parse_text_completion(completion)
372}
373
374/// Prepare messages for the LLM request
375pub fn prepare_llm_messages(
376    chunk: &str,
377    system_message: &APIMessage,
378) -> anyhow::Result<Vec<APIMessage>> {
379    let messages = vec![
380        system_message.clone(),
381        APIMessage {
382            role: MessageRole::User,
383            fields: APIMessageKind::Text(APIMessageText {
384                content: format!(
385                    "{}\n\n{}{}\n{}",
386                    USER_PROMPT_START, JSON_BEGIN_MARKER, chunk, JSON_END_MARKER
387                ),
388            }),
389        },
390    ];
391
392    Ok(messages)
393}
394
#[cfg(test)]
mod tests {
    use super::*;
    use crate::llm_utils::{APIMessageKind, APIMessageText};
    use serde_json::json;

    const TEST_BLOCK_NAME: &str = "test/block";

    /// Builds a minimal block whose only attribute is the given content string.
    fn create_test_block(content: &str) -> GutenbergBlock {
        let mut attributes = serde_json::Map::new();
        attributes.insert("content".to_string(), json!(content));
        GutenbergBlock {
            client_id: uuid::Uuid::new_v4(),
            name: TEST_BLOCK_NAME.to_string(),
            is_valid: true,
            attributes,
            inner_blocks: vec![],
        }
    }

    #[test]
    fn test_calculate_safe_token_limit() {
        assert_eq!(calculate_safe_token_limit(1000, 0.75), 750);
        assert_eq!(calculate_safe_token_limit(16000, 0.75), 12000);
        assert_eq!(calculate_safe_token_limit(8000, 0.5), 4000);
    }

    #[test]
    fn test_append_markdown_with_separator() {
        // Empty accumulator: no separator should be inserted.
        let mut acc = String::new();
        append_markdown_with_separator(&mut acc, "New content");
        assert_eq!(acc, "New content");

        // Every trailing-newline variant must normalize to one blank line.
        let endings = [
            "Existing content",
            "Existing content\n",
            "Existing content\n\n",
        ];
        for existing in endings {
            let mut acc = String::from(existing);
            append_markdown_with_separator(&mut acc, "New content");
            assert_eq!(acc, "Existing content\n\nNew content");
        }
    }

    #[test]
    fn test_split_blocks_into_chunks() -> anyhow::Result<()> {
        // Content strings of different lengths influence token estimation.
        let short_block = create_test_block("a ");
        let long_block = create_test_block("b b b b b b b b b b b b b b b b b b b b ");
        let medium_block = create_test_block("c c c c c c c c c c c c c c c ");

        let blocks = vec![
            short_block.clone(),
            long_block.clone(),
            medium_block.clone(),
        ];

        let short_tokens = estimate_tokens(&block_to_json_string(&short_block)?);
        let long_tokens = estimate_tokens(&block_to_json_string(&long_block)?);
        let medium_tokens = estimate_tokens(&block_to_json_string(&medium_block)?);

        // A generous limit keeps everything in a single chunk.
        let total_budget = short_tokens + long_tokens + medium_tokens + 10;
        let chunks = split_blocks_into_chunks(&blocks, total_budget)?;
        assert_eq!(chunks.len(), 1);

        let all_blocks: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(all_blocks.len(), 3);

        // A tight limit forces a split right after the first block.
        let chunks = split_blocks_into_chunks(&blocks, short_tokens + 1)?;

        // First chunk should be a valid JSON array with one block.
        let head: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(head.len(), 1);
        assert_eq!(head[0].client_id, short_block.client_id);

        // Later chunks may be raw JSON fragments that cannot be deserialized;
        // just verify they carry content.
        for chunk in &chunks[1..] {
            assert!(!chunk.is_empty());
        }

        Ok(())
    }

    #[test]
    fn test_prepare_llm_messages() -> anyhow::Result<()> {
        let payload = blocks_to_json_string(&[create_test_block("Test content")])?;
        let system_message = APIMessage {
            role: MessageRole::System,
            fields: APIMessageKind::Text(APIMessageText {
                content: "System prompt".to_string(),
            }),
        };

        let messages = prepare_llm_messages(&payload, &system_message)?;
        assert_eq!(messages.len(), 2);

        let text_of = |message: &APIMessage| match &message.fields {
            APIMessageKind::Text(text) => text.content.clone(),
            _ => String::new(),
        };

        assert_eq!(messages[0].role, MessageRole::System);
        assert_eq!(text_of(&messages[0]), "System prompt");
        assert_eq!(messages[1].role, MessageRole::User);
        assert!(text_of(&messages[1]).contains(JSON_BEGIN_MARKER));
        assert!(text_of(&messages[1]).contains("Test content"));

        Ok(())
    }
}