Skip to main content

headless_lms_chatbot/
content_cleaner.rs

1use crate::azure_chatbot::{
2    InputItem, LLMRequest, LLMRequestParams, NonThinkingParams, ThinkingParams,
3};
4use crate::llm_utils::{
5    APIInputMessage, MessageContent, estimate_tokens, make_blocking_llm_request, model_is_thinking,
6    parse_text_completion,
7};
8use crate::prelude::*;
9use headless_lms_models::application_task_default_language_models::TaskLMSpec;
10use headless_lms_models::chatbot_conversation_message_messages::MessageRole;
11use headless_lms_utils::document_schema_processor::GutenbergBlock;
12use serde_json::Value;
13use tracing::{debug, error, info, instrument, warn};
14
/// Sampling temperature sent with LLM requests; kept low for near-deterministic results.
///
/// Only used for non-thinking models: requests to thinking models are built without a
/// temperature.
pub const REQUEST_TEMPERATURE: f32 = 0.1;
17
/// Marker written directly before the chunk content in the user message sent to the LLM
const JSON_BEGIN_MARKER: &str = "---BEGIN COURSE MATERIAL JSON---";
/// Marker written on its own line after the chunk content in the user message sent to the LLM
const JSON_END_MARKER: &str = "---END COURSE MATERIAL JSON---";
21
/// System prompt instructing the LLM how to convert course material JSON into markdown.
///
/// Sent as the system message of every chunk request. Its estimated token count is subtracted
/// from the usable context when the per-chunk content budget is computed.
const SYSTEM_PROMPT: &str = r#"You are given course material in an abstract JSON format from a headless CMS. Convert this into clean, semantic Markdown that includes all user-visible content to support full-text search.

* Extract and include all meaningful text content: paragraphs, headings, list items, image captions, and similar.
* Retain any inline formatting (like bold or italic text), converting HTML tags (`<strong>`, `<em>`, etc.) into equivalent Markdown formatting.
* For images, use the standard Markdown format: `![caption](url)`, including a caption if available.
* Preserve heading levels (e.g., level 2 → `##`, level 3 → `###`).
* Include text content from any block type, even non-standard ones, if it appears user-visible.
* For exercise blocks, include the exercise name, and assignment instructions. You may also include text from the exercise specification (public spec), if it can be formatted into markdown.
* If you encounter blocks that don't have any visible text in the JSON but are likely still user-visible (placeholder blocks) — e.g. `glossary`, `exercises-in-this-chapter`, `course-progress` — generate a fake heading representing the expected content (e.g. `## Glossary`).
* Do not generate headings for placeholder blocks that are not user-visible — e.g. `conditionally-visible-content`, `spacer`, `divider`.
* Exclude all purely stylistic attributes (e.g. colors, alignment, font sizes).
* Do not include any metadata, HTML tags (other than for formatting), or non-visible fields.
* Output **only the Markdown content**, and nothing else.
"#;
37
/// Opening instruction of the user message; the chunk wrapped in the JSON markers is appended
/// after it when the request messages are prepared
const USER_PROMPT_START: &str =
    "Convert this JSON content to clean markdown. Output only the markdown, nothing else.";
41
42/// Cleans content by converting the material blocks to clean markdown using an LLM
43#[instrument(skip(blocks, app_config, task_lm), fields(num_blocks = blocks.len()))]
44pub async fn convert_material_blocks_to_markdown_with_llm(
45    blocks: &[GutenbergBlock],
46    app_config: &ApplicationConfiguration,
47    task_lm: &TaskLMSpec,
48) -> anyhow::Result<String> {
49    debug!("Starting content conversion with {} blocks", blocks.len());
50    let system_message = APIInputMessage {
51        message_type: InputItem::Message {
52            role: MessageRole::System,
53            content: MessageContent::Text(SYSTEM_PROMPT.to_string()),
54        },
55    };
56
57    let system_message_tokens = estimate_tokens(SYSTEM_PROMPT);
58    let safe_token_limit =
59        calculate_safe_token_limit(task_lm.context_size, task_lm.context_utilization);
60    let max_content_tokens = (safe_token_limit - system_message_tokens).max(1);
61
62    debug!(
63        "Token limits - system: {}, safe: {}, max content: {}",
64        system_message_tokens, safe_token_limit, max_content_tokens
65    );
66
67    let chunks = split_blocks_into_chunks(blocks, max_content_tokens)?;
68    debug!("Split content into {} chunks", chunks.len());
69    process_chunks(&chunks, &system_message, app_config, task_lm).await
70}
71
/// Returns the number of context tokens that may be used, given the model's context window
/// and the fraction of it (`utilization`) that is allowed to be filled.
///
/// The product is computed in `f32` and truncated towards zero when converted back to `i32`.
pub fn calculate_safe_token_limit(context_window: i32, utilization: f32) -> i32 {
    let usable_tokens = utilization * context_window as f32;
    usable_tokens as i32
}
76
77/// Recursively removes all fields named "private_spec" from a JSON value
78fn remove_private_spec_recursive(value: &mut Value) {
79    match value {
80        Value::Object(map) => {
81            map.remove("private_spec");
82            for (_, v) in map.iter_mut() {
83                remove_private_spec_recursive(v);
84            }
85        }
86        Value::Array(arr) => {
87            for item in arr.iter_mut() {
88                remove_private_spec_recursive(item);
89            }
90        }
91        _ => {}
92    }
93}
94
95/// Converts a block to JSON string, removing any private_spec fields recursively
96fn block_to_json_string(block: &GutenbergBlock) -> anyhow::Result<String> {
97    let mut json_value = serde_json::to_value(block)?;
98    remove_private_spec_recursive(&mut json_value);
99    Ok(serde_json::to_string(&json_value)?)
100}
101
102/// Converts a vector of blocks to JSON string, removing any private_spec fields recursively
103fn blocks_to_json_string(blocks: &[GutenbergBlock]) -> anyhow::Result<String> {
104    let mut json_value = serde_json::to_value(blocks)?;
105    remove_private_spec_recursive(&mut json_value);
106    Ok(serde_json::to_string(&json_value)?)
107}
108
109/// Split blocks into chunks that fit within token limits
110#[instrument(skip(blocks), fields(max_content_tokens))]
111pub fn split_blocks_into_chunks(
112    blocks: &[GutenbergBlock],
113    max_content_tokens: i32,
114) -> anyhow::Result<Vec<String>> {
115    debug!("Starting to split {} blocks into chunks", blocks.len());
116    let mut chunks: Vec<String> = Vec::new();
117    let mut current_chunk: Vec<GutenbergBlock> = Vec::new();
118    let mut current_chunk_tokens = 0;
119
120    for block in blocks {
121        let block_json = block_to_json_string(block)?;
122        let block_tokens = estimate_tokens(&block_json);
123        debug!(
124            "Processing block {} with {} tokens",
125            block.client_id, block_tokens
126        );
127
128        // If this block alone exceeds the limit, split it into smaller chunks
129        if block_tokens > max_content_tokens {
130            warn!(
131                "Block {} exceeds max token limit ({} > {})",
132                block.client_id, block_tokens, max_content_tokens
133            );
134            // Add any accumulated blocks as a chunk
135            if !current_chunk.is_empty() {
136                chunks.push(blocks_to_json_string(&current_chunk)?);
137                current_chunk = Vec::new();
138                current_chunk_tokens = 0;
139            }
140
141            // Then we do some crude splitting for the oversized block
142            split_oversized_block(&block_json, max_content_tokens, &mut chunks)?;
143            continue;
144        }
145
146        if current_chunk_tokens + block_tokens > max_content_tokens {
147            debug!(
148                "Creating new chunk after {} blocks ({} tokens)",
149                current_chunk.len(),
150                current_chunk_tokens
151            );
152            chunks.push(blocks_to_json_string(&current_chunk)?);
153            current_chunk = Vec::new();
154            current_chunk_tokens = 0;
155        }
156
157        current_chunk.push(block.clone());
158        current_chunk_tokens += block_tokens;
159    }
160
161    if !current_chunk.is_empty() {
162        debug!(
163            "Adding final chunk with {} blocks ({} tokens)",
164            current_chunk.len(),
165            current_chunk_tokens
166        );
167        chunks.push(blocks_to_json_string(&current_chunk)?);
168    }
169
170    Ok(chunks)
171}
172
173/// Splits an oversized block into smaller string chunks
174#[instrument(skip(block_json, chunks), fields(max_tokens))]
175fn split_oversized_block(
176    block_json: &str,
177    max_tokens: i32,
178    chunks: &mut Vec<String>,
179) -> anyhow::Result<()> {
180    let total_tokens = estimate_tokens(block_json);
181    debug!(
182        "Splitting oversized block with {} tokens into chunks of max {} tokens",
183        total_tokens, max_tokens
184    );
185
186    // Make a very conservative estimate of the number of chunks we need
187    // Ensure max_tokens is at least 1 to avoid division by zero
188    let max_tokens_safe = max_tokens.max(1);
189    let num_chunks = (total_tokens as f32 / (max_tokens_safe as f32 * 0.5)).ceil() as usize;
190
191    if num_chunks <= 1 || num_chunks == 0 {
192        chunks.push(block_json.to_string());
193        return Ok(());
194    }
195
196    // Split by byte length (not character count) for efficiency,
197    // but ensure we only slice at UTF-8 character boundaries
198    let bytes_per_chunk = (block_json.len() / num_chunks).max(1);
199    debug!(
200        "Splitting into {} chunks of approximately {} bytes each",
201        num_chunks, bytes_per_chunk
202    );
203
204    let mut start = 0;
205    let mut iterations = 0;
206    const MAX_ITERATIONS: usize = 100;
207    while start < block_json.len() {
208        iterations += 1;
209        if iterations > MAX_ITERATIONS {
210            return Err(anyhow::anyhow!(
211                "Infinite loop protection: exceeded {} iterations in split_oversized_block",
212                MAX_ITERATIONS
213            ));
214        }
215
216        // Use checked arithmetic to prevent overflow
217        let end_candidate = start
218            .checked_add(bytes_per_chunk)
219            .unwrap_or(block_json.len())
220            .min(block_json.len());
221
222        let mut end = if end_candidate >= block_json.len() {
223            block_json.len()
224        } else {
225            end_candidate
226        };
227
228        // Adjust end backwards to the nearest UTF-8 character boundary
229        while !block_json.is_char_boundary(end) && end > start {
230            end -= 1;
231        }
232
233        // If backtracking resulted in end == start, advance forward to next boundary
234        if end == start {
235            // Find the next character boundary after start
236            let mut next_boundary = start
237                .checked_add(1)
238                .unwrap_or(block_json.len())
239                .min(block_json.len());
240
241            let mut boundary_iterations = 0;
242            const MAX_BOUNDARY_ITERATIONS: usize = 100;
243            while next_boundary < block_json.len() && !block_json.is_char_boundary(next_boundary) {
244                boundary_iterations += 1;
245                if boundary_iterations > MAX_BOUNDARY_ITERATIONS {
246                    return Err(anyhow::anyhow!(
247                        "Infinite loop protection: exceeded {} iterations finding character boundary",
248                        MAX_BOUNDARY_ITERATIONS
249                    ));
250                }
251                next_boundary = next_boundary
252                    .checked_add(1)
253                    .unwrap_or(block_json.len())
254                    .min(block_json.len());
255            }
256            end = next_boundary.min(block_json.len());
257        }
258
259        // Ensure we have a non-empty slice and valid bounds
260        if end > start && end <= block_json.len() && start < block_json.len() {
261            // Double-check bounds before slicing
262            let chunk = block_json.get(start..end).ok_or_else(|| {
263                anyhow::anyhow!("Invalid string slice bounds: {}..{}", start, end)
264            })?;
265            chunks.push(chunk.to_string());
266            let new_start = end;
267            // Safety check: ensure start always advances
268            if new_start <= start {
269                return Err(anyhow::anyhow!(
270                    "Infinite loop protection: start did not advance ({} -> {})",
271                    start,
272                    new_start
273                ));
274            }
275            start = new_start;
276        } else {
277            // Safety: if we can't make progress, break to avoid infinite loop
278            // Push remaining content if any
279            if start < block_json.len()
280                && let Some(remaining) = block_json.get(start..)
281                && !remaining.is_empty()
282            {
283                chunks.push(remaining.to_string());
284            }
285            break;
286        }
287    }
288
289    Ok(())
290}
291
/// Appends `new_content` to `result`, making sure a blank line separates it from any text
/// already present.
///
/// Nothing is inserted when `result` is empty or already ends with two newlines; a single
/// trailing newline is topped up with one more. The separator is added even when
/// `new_content` is empty.
pub fn append_markdown_with_separator(result: &mut String, new_content: &str) {
    let trailing_newlines = result
        .bytes()
        .rev()
        .take_while(|&byte| byte == b'\n')
        .count();

    let separator = if result.is_empty() {
        ""
    } else {
        match trailing_newlines {
            0 => "\n\n",
            1 => "\n",
            _ => "",
        }
    };

    result.push_str(separator);
    result.push_str(new_content);
}
304
305/// Process all chunks and combine the results
306#[instrument(skip(chunks, system_message, app_config, task_lm), fields(num_chunks = chunks.len()))]
307async fn process_chunks(
308    chunks: &[String],
309    system_message: &APIInputMessage,
310    app_config: &ApplicationConfiguration,
311    task_lm: &TaskLMSpec,
312) -> anyhow::Result<String> {
313    debug!("Processing {} chunks", chunks.len());
314    let mut result = String::new();
315
316    for (i, chunk) in chunks.iter().enumerate() {
317        debug!("Processing chunk {}/{}", i + 1, chunks.len());
318        let chunk_markdown =
319            process_block_chunk(chunk, system_message, app_config, task_lm).await?;
320        append_markdown_with_separator(&mut result, &chunk_markdown);
321    }
322
323    info!("Successfully cleaned content with LLM");
324    Ok(result)
325}
326
327/// Process a subset of blocks in a single LLM request
328#[instrument(skip(chunk, system_message, app_config, task_lm), fields(chunk_tokens = estimate_tokens(chunk)))]
329async fn process_block_chunk(
330    chunk: &str,
331    system_message: &APIInputMessage,
332    app_config: &ApplicationConfiguration,
333    task_lm: &TaskLMSpec,
334) -> ChatbotResult<String> {
335    let input = prepare_llm_messages(chunk, system_message)?;
336    let params = if model_is_thinking(task_lm.model_type) {
337        LLMRequestParams::GPTThinking(ThinkingParams { reasoning: None })
338    } else {
339        LLMRequestParams::GPTNonThinking(NonThinkingParams {
340            temperature: Some(REQUEST_TEMPERATURE),
341            top_p: None,
342            frequency_penalty: None,
343            presence_penalty: None,
344        })
345    };
346    let llm_base_request = LLMRequest {
347        input,
348        max_output_tokens: None,
349        model: task_lm.model.to_owned(),
350        tools: vec![],
351        tool_choice: None,
352        params,
353        text: None,
354    };
355    info!(
356        "Processing chunk of approximately {} tokens",
357        estimate_tokens(chunk)
358    );
359
360    let completion = match make_blocking_llm_request(llm_base_request, app_config).await {
361        Ok(completion) => completion,
362        Err(e) => {
363            error!("Failed to process chunk: {}", e);
364            return Err(ChatbotError::from(e));
365        }
366    };
367
368    parse_text_completion(completion)
369}
370
371/// Prepare messages for the LLM request
372pub fn prepare_llm_messages(
373    chunk: &str,
374    system_message: &APIInputMessage,
375) -> anyhow::Result<Vec<APIInputMessage>> {
376    let content = format!(
377        "{}\n\n{}{}\n{}",
378        USER_PROMPT_START, JSON_BEGIN_MARKER, chunk, JSON_END_MARKER
379    );
380    let messages = vec![
381        system_message.clone(),
382        APIInputMessage {
383            message_type: InputItem::Message {
384                role: MessageRole::User,
385                content: MessageContent::Text(content),
386            },
387        },
388    ];
389
390    Ok(messages)
391}
392
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    const TEST_BLOCK_NAME: &str = "test/block";

    /// Builds a valid block with a fresh client id whose only attribute is `content`
    fn create_test_block(content: &str) -> GutenbergBlock {
        let mut attributes = serde_json::Map::new();
        attributes.insert("content".to_string(), json!(content));
        GutenbergBlock {
            client_id: uuid::Uuid::new_v4(),
            name: TEST_BLOCK_NAME.to_string(),
            is_valid: true,
            attributes,
            inner_blocks: vec![],
        }
    }

    /// Returns the text and role of a message item; any other item kind yields an empty text
    /// and no role
    fn text_and_role(message: &APIInputMessage) -> (String, Option<&MessageRole>) {
        match &message.message_type {
            InputItem::Message { role, content } => (
                content.to_owned().get_content_text().to_string(),
                Some(role),
            ),
            _ => (String::new(), None),
        }
    }

    /// The safe limit is the context window scaled by the utilization
    #[test]
    fn test_calculate_safe_token_limit() {
        let cases = [(1000, 0.75, 750), (16000, 0.75, 12000), (8000, 0.5, 4000)];
        for (context_window, utilization, expected) in cases {
            assert_eq!(
                calculate_safe_token_limit(context_window, utilization),
                expected
            );
        }
    }

    /// Existing text always ends up separated from the new content by exactly one blank line
    #[test]
    fn test_append_markdown_with_separator() {
        let cases = [
            ("", "New content"),
            ("Existing content", "Existing content\n\nNew content"),
            ("Existing content\n", "Existing content\n\nNew content"),
            ("Existing content\n\n", "Existing content\n\nNew content"),
        ];
        for (existing, expected) in cases {
            let mut result = String::from(existing);
            append_markdown_with_separator(&mut result, "New content");
            assert_eq!(result, expected);
        }
    }

    /// Blocks stay together when they fit and are split once the limit is too small
    #[test]
    fn test_split_blocks_into_chunks() -> anyhow::Result<()> {
        // Use content strings of different lengths to influence token estimation
        let blocks = vec![
            create_test_block("a "),                                       // short
            create_test_block("b b b b b b b b b b b b b b b b b b b b "), // longer
            create_test_block("c c c c c c c c c c c c c c c "),           // medium
        ];

        // Estimate tokens for each block
        let token_counts = blocks
            .iter()
            .map(|block| block_to_json_string(block).map(|json| estimate_tokens(&json)))
            .collect::<anyhow::Result<Vec<_>>>()?;
        let total_tokens = token_counts.iter().sum::<i32>();

        // A limit that fits all blocks yields a single chunk holding all three
        let chunks = split_blocks_into_chunks(&blocks, total_tokens + 10)?;
        assert_eq!(chunks.len(), 1);

        let all_blocks: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(all_blocks.len(), 3);

        // A limit that requires splitting after the first block
        let chunks = split_blocks_into_chunks(&blocks, token_counts[0] + 1)?;

        // First chunk should be a valid JSON array with one block
        let first_chunk: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(first_chunk.len(), 1);
        assert_eq!(first_chunk[0].client_id, blocks[0].client_id);

        // Remaining chunks might be split JSON strings, so we can't deserialize them
        // Just verify they're not empty
        assert!(chunks.iter().skip(1).all(|chunk| !chunk.is_empty()));

        Ok(())
    }

    /// The prepared messages are the system message followed by a user message with the chunk
    #[test]
    fn test_prepare_llm_messages() -> anyhow::Result<()> {
        let blocks_json = blocks_to_json_string(&[create_test_block("Test content")])?;
        let system_message = APIInputMessage {
            message_type: InputItem::Message {
                role: MessageRole::System,
                content: MessageContent::Text("System prompt".to_string()),
            },
        };

        let messages = prepare_llm_messages(&blocks_json, &system_message)?;
        assert_eq!(messages.len(), 2);

        let (system_text, system_role) = text_and_role(&messages[0]);
        let (user_text, user_role) = text_and_role(&messages[1]);

        assert_eq!(system_role, Some(&MessageRole::System));
        assert_eq!(system_text, "System prompt");
        assert_eq!(user_role, Some(&MessageRole::User));
        assert!(user_text.contains(JSON_BEGIN_MARKER));
        assert!(user_text.contains("Test content"));

        Ok(())
    }
}
515}