headless_lms_chatbot/content_cleaner.rs

use crate::azure_chatbot::{LLMRequest, LLMRequestParams, NonThinkingParams};
use crate::llm_utils::{APIMessage, MessageRole, estimate_tokens, make_blocking_llm_request};
use crate::prelude::*;
use headless_lms_utils::document_schema_processor::GutenbergBlock;
use tracing::{debug, error, info, instrument, warn};

/// Maximum context window size of the LLM, in tokens
pub const MAX_CONTEXT_WINDOW: i32 = 16000;
/// Maximum fraction of the context window to use in a single request
pub const MAX_CONTEXT_UTILIZATION: f32 = 0.75;
/// Temperature for requests; kept low for deterministic results
pub const REQUEST_TEMPERATURE: f32 = 0.1;

/// JSON markers for the LLM prompt
const JSON_BEGIN_MARKER: &str = "---BEGIN COURSE MATERIAL JSON---";
const JSON_END_MARKER: &str = "---END COURSE MATERIAL JSON---";

/// System prompt for converting course material to markdown
const SYSTEM_PROMPT: &str = r#"You are given course material in an abstract JSON format from a headless CMS. Convert this into clean, semantic Markdown that includes all user-visible content to support full-text search.

* Extract and include all meaningful text content: paragraphs, headings, list items, image captions, and similar.
* Retain any inline formatting (like bold or italic text), converting HTML tags (`<strong>`, `<em>`, etc.) into equivalent Markdown formatting.
* For images, use the standard Markdown format: `![caption](url)`, including a caption if available.
* Preserve heading levels (e.g., level 2 → `##`, level 3 → `###`).
* Include text content from any block type, even non-standard ones, if it appears user-visible.
* For exercise blocks, include the exercise name and the assignment instructions. You may also include text from the exercise specification (public spec) if it can be formatted into markdown.
* If you encounter blocks that don't have any visible text in the JSON but are likely still user-visible (placeholder blocks) — e.g. `glossary`, `exercises-in-this-chapter`, `course-progress` — generate a fake heading representing the expected content (e.g. `## Glossary`).
* Do not generate headings for placeholder blocks that are not user-visible — e.g. `conditionally-visible-content`, `spacer`, `divider`.
* Exclude all purely stylistic attributes (e.g. colors, alignment, font sizes).
* Do not include any metadata, HTML tags (other than for formatting), or non-visible fields.
* Output **only the Markdown content**, and nothing else.
"#;

/// User prompt for converting course material to markdown
const USER_PROMPT_START: &str =
    "Convert this JSON content to clean markdown. Output only the markdown, nothing else.";

/// Cleans content by converting the material blocks to clean markdown using an LLM
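/// # Example
///
/// ```ignore
/// // Illustrative sketch only; assumes `blocks` (the page content) and
/// // `app_config` are available from the surrounding application.
/// let markdown =
///     convert_material_blocks_to_markdown_with_llm(&blocks, &app_config).await?;
/// debug!("Converted {} characters of markdown", markdown.len());
/// ```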
#[instrument(skip(blocks, app_config), fields(num_blocks = blocks.len()))]
pub async fn convert_material_blocks_to_markdown_with_llm(
    blocks: &[GutenbergBlock],
    app_config: &ApplicationConfiguration,
) -> anyhow::Result<String> {
    debug!("Starting content conversion with {} blocks", blocks.len());
    let system_message = APIMessage {
        role: MessageRole::System,
        content: SYSTEM_PROMPT.to_string(),
    };

    let system_message_tokens = estimate_tokens(&system_message.content);
    let safe_token_limit = calculate_safe_token_limit(MAX_CONTEXT_WINDOW, MAX_CONTEXT_UTILIZATION);
    let max_content_tokens = safe_token_limit - system_message_tokens;

    debug!(
        "Token limits - system: {}, safe: {}, max content: {}",
        system_message_tokens, safe_token_limit, max_content_tokens
    );

    let chunks = split_blocks_into_chunks(blocks, max_content_tokens)?;
    debug!("Split content into {} chunks", chunks.len());
    process_chunks(&chunks, &system_message, app_config).await
}

/// Calculate the safe token limit based on context window and utilization
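///
/// # Example
///
/// ```ignore
/// // With the defaults above: 16 000 * 0.75 = 12 000 usable tokens.
/// assert_eq!(
///     calculate_safe_token_limit(MAX_CONTEXT_WINDOW, MAX_CONTEXT_UTILIZATION),
///     12000
/// );
/// ```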
pub fn calculate_safe_token_limit(context_window: i32, utilization: f32) -> i32 {
    (context_window as f32 * utilization) as i32
}

/// Split blocks into chunks that fit within token limits
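///
/// Each returned chunk is the JSON serialization of a `Vec<GutenbergBlock>`,
/// except for fragments of a single oversized block (see
/// `split_oversized_block`), which are raw slices of that block's JSON and
/// may not parse on their own.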
#[instrument(skip(blocks), fields(max_content_tokens))]
pub fn split_blocks_into_chunks(
    blocks: &[GutenbergBlock],
    max_content_tokens: i32,
) -> anyhow::Result<Vec<String>> {
    debug!("Starting to split {} blocks into chunks", blocks.len());
    let mut chunks: Vec<String> = Vec::new();
    let mut current_chunk: Vec<GutenbergBlock> = Vec::new();
    let mut current_chunk_tokens = 0;

    for block in blocks {
        let block_json = serde_json::to_string(block)?;
        let block_tokens = estimate_tokens(&block_json);
        debug!(
            "Processing block {} with {} tokens",
            block.client_id, block_tokens
        );

        // If this block alone exceeds the limit, split it into smaller chunks
        if block_tokens > max_content_tokens {
            warn!(
                "Block {} exceeds max token limit ({} > {})",
                block.client_id, block_tokens, max_content_tokens
            );
            // Flush any accumulated blocks as a chunk first
            if !current_chunk.is_empty() {
                chunks.push(serde_json::to_string(&current_chunk)?);
                current_chunk = Vec::new();
                current_chunk_tokens = 0;
            }

            // Then fall back to crude character-based splitting for the oversized block
            split_oversized_block(&block_json, max_content_tokens, &mut chunks)?;
            continue;
        }

        if current_chunk_tokens + block_tokens > max_content_tokens {
            debug!(
                "Creating new chunk after {} blocks ({} tokens)",
                current_chunk.len(),
                current_chunk_tokens
            );
            chunks.push(serde_json::to_string(&current_chunk)?);
            current_chunk = Vec::new();
            current_chunk_tokens = 0;
        }

        current_chunk.push(block.clone());
        current_chunk_tokens += block_tokens;
    }

    if !current_chunk.is_empty() {
        debug!(
            "Adding final chunk with {} blocks ({} tokens)",
            current_chunk.len(),
            current_chunk_tokens
        );
        chunks.push(serde_json::to_string(&current_chunk)?);
    }

    Ok(chunks)
}

/// Splits an oversized block into smaller string chunks
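///
/// The resulting chunks are plain character ranges of the serialized block,
/// so they are generally not valid JSON on their own; the LLM is handed the
/// truncated JSON as-is.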
#[instrument(skip(block_json, chunks), fields(max_tokens))]
fn split_oversized_block(
    block_json: &str,
    max_tokens: i32,
    chunks: &mut Vec<String>,
) -> anyhow::Result<()> {
    let total_tokens = estimate_tokens(block_json);
    debug!(
        "Splitting oversized block with {} tokens into chunks of max {} tokens",
        total_tokens, max_tokens
    );

    // Make a very conservative estimate of the number of chunks we need
    let num_chunks = (total_tokens as f32 / (max_tokens as f32 * 0.5)).ceil() as usize;

    if num_chunks <= 1 {
        chunks.push(block_json.to_string());
        return Ok(());
    }

    // Guard against a zero-sized step, which would make the loop below spin forever
    let chars_per_chunk = (block_json.len() / num_chunks).max(1);
    debug!(
        "Splitting into {} chunks of approximately {} chars each",
        num_chunks, chars_per_chunk
    );

    let mut start = 0;
    while start < block_json.len() {
        let mut end = (start + chars_per_chunk).min(block_json.len());
        // Byte-indexed slicing panics if it lands inside a multi-byte UTF-8
        // character, so advance to the next char boundary when necessary
        while end < block_json.len() && !block_json.is_char_boundary(end) {
            end += 1;
        }

        chunks.push(block_json[start..end].to_string());

        start = end;
    }

    Ok(())
}

/// Appends markdown content to a result string with proper newline separators
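///
/// # Example
///
/// ```ignore
/// let mut result = String::from("# Heading");
/// append_markdown_with_separator(&mut result, "Body text.");
/// assert_eq!(result, "# Heading\n\nBody text.");
/// ```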
pub fn append_markdown_with_separator(result: &mut String, new_content: &str) {
    if !result.is_empty() && !result.ends_with("\n\n") {
        if result.ends_with('\n') {
            result.push('\n');
        } else {
            result.push_str("\n\n");
        }
    }

    result.push_str(new_content);
}

/// Process all chunks and combine the results
#[instrument(skip(chunks, system_message, app_config), fields(num_chunks = chunks.len()))]
async fn process_chunks(
    chunks: &[String],
    system_message: &APIMessage,
    app_config: &ApplicationConfiguration,
) -> anyhow::Result<String> {
    debug!("Processing {} chunks", chunks.len());
    let mut result = String::new();

    for (i, chunk) in chunks.iter().enumerate() {
        debug!("Processing chunk {}/{}", i + 1, chunks.len());
        let chunk_markdown = process_block_chunk(chunk, system_message, app_config).await?;
        append_markdown_with_separator(&mut result, &chunk_markdown);
    }

    info!("Successfully cleaned content with LLM");
    Ok(result)
}

/// Processes a single chunk of serialized blocks in one LLM request
#[instrument(skip(chunk, system_message, app_config), fields(chunk_tokens = estimate_tokens(chunk)))]
async fn process_block_chunk(
    chunk: &str,
    system_message: &APIMessage,
    app_config: &ApplicationConfiguration,
) -> anyhow::Result<String> {
    let messages = prepare_llm_messages(chunk, system_message)?;
    let llm_base_request: LLMRequest = LLMRequest {
        messages,
        data_sources: vec![],
        params: LLMRequestParams::NonThinking(NonThinkingParams {
            temperature: Some(REQUEST_TEMPERATURE),
            top_p: None,
            frequency_penalty: None,
            presence_penalty: None,
            max_tokens: None,
        }),
        stop: None,
    };
    info!(
        "Processing chunk of approximately {} tokens",
        estimate_tokens(chunk)
    );

    let completion = match make_blocking_llm_request(llm_base_request, app_config).await {
        Ok(completion) => completion,
        Err(e) => {
            error!("Failed to process chunk: {}", e);
            return Err(e);
        }
    };

    let cleaned_content = completion
        .choices
        .first()
        .ok_or_else(|| {
            error!("No content returned from LLM");
            anyhow::anyhow!("No content returned from LLM")
        })?
        .message
        .content
        .clone();

    Ok(cleaned_content)
}

/// Prepare messages for the LLM request
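///
/// Returns the system message followed by a single user message that wraps
/// the JSON chunk between `JSON_BEGIN_MARKER` and `JSON_END_MARKER`, so the
/// model can tell the instructions apart from the data.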
pub fn prepare_llm_messages(
    chunk: &str,
    system_message: &APIMessage,
) -> anyhow::Result<Vec<APIMessage>> {
    let messages = vec![
        system_message.clone(),
        APIMessage {
            role: MessageRole::User,
            content: format!(
                // Keep each marker on its own line so the JSON payload is
                // clearly delimited
                "{}\n\n{}\n{}\n{}",
                USER_PROMPT_START, JSON_BEGIN_MARKER, chunk, JSON_END_MARKER
            ),
        },
    ];

    Ok(messages)
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    const TEST_BLOCK_NAME: &str = "test/block";

    #[test]
    fn test_calculate_safe_token_limit() {
        assert_eq!(calculate_safe_token_limit(1000, 0.75), 750);
        assert_eq!(calculate_safe_token_limit(16000, 0.75), 12000);
        assert_eq!(calculate_safe_token_limit(8000, 0.5), 4000);
    }

    #[test]
    fn test_append_markdown_with_separator() {
        let mut result = String::new();
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "New content");

        let mut result = String::from("Existing content");
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "Existing content\n\nNew content");

        let mut result = String::from("Existing content\n");
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "Existing content\n\nNew content");

        let mut result = String::from("Existing content\n\n");
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "Existing content\n\nNew content");
    }

    #[test]
    fn test_split_blocks_into_chunks() -> anyhow::Result<()> {
        // Use content strings of different lengths to influence token estimation
        let block1 = create_test_block("a "); // short
        let block2 = create_test_block("b b b b b b b b b b b b b b b b b b b b "); // longer
        let block3 = create_test_block("c c c c c c c c c c c c c c c "); // medium

        let blocks = vec![block1.clone(), block2.clone(), block3.clone()];

        // Estimate tokens for each block
        let t1 = estimate_tokens(&serde_json::to_string(&block1)?);
        let t2 = estimate_tokens(&serde_json::to_string(&block2)?);
        let t3 = estimate_tokens(&serde_json::to_string(&block3)?);

        // Test with a limit that fits all blocks
        let chunks = split_blocks_into_chunks(&blocks, t1 + t2 + t3 + 10)?;
        assert_eq!(chunks.len(), 1);

        let deserialized_chunk: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(deserialized_chunk.len(), 3);

        // Test with a limit that requires splitting after the first block
        let chunks = split_blocks_into_chunks(&blocks, t1 + 1)?;

        // First chunk should be a valid JSON array with one block
        let first_chunk: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(first_chunk.len(), 1);
        assert_eq!(first_chunk[0].client_id, block1.client_id);

        // Remaining chunks might be split JSON strings, so we can't deserialize them.
        // Just verify they're not empty
        for chunk in &chunks[1..] {
            assert!(!chunk.is_empty());
        }

        Ok(())
    }

    #[test]
    fn test_prepare_llm_messages() -> anyhow::Result<()> {
        let blocks = vec![create_test_block("Test content")];
        let blocks_json = serde_json::to_string(&blocks)?;
        let system_message = APIMessage {
            role: MessageRole::System,
            content: "System prompt".to_string(),
        };

        let messages = prepare_llm_messages(&blocks_json, &system_message)?;

        assert_eq!(messages.len(), 2);
        assert_eq!(messages[0].role, MessageRole::System);
        assert_eq!(messages[0].content, "System prompt");
        assert_eq!(messages[1].role, MessageRole::User);
        assert!(messages[1].content.contains(JSON_BEGIN_MARKER));
        assert!(messages[1].content.contains("Test content"));

        Ok(())
    }
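
    // Splitting must respect UTF-8 character boundaries, since serialized
    // block JSON may contain multi-byte characters. The exact number of
    // fragments depends on the `estimate_tokens` heuristic, so this only
    // asserts lossless reassembly.
    #[test]
    fn test_split_oversized_block_respects_char_boundaries() -> anyhow::Result<()> {
        let json = "€".repeat(500);
        let mut chunks = Vec::new();
        split_oversized_block(&json, 10, &mut chunks)?;
        assert!(!chunks.is_empty());
        // No bytes may be lost or duplicated by the split
        assert_eq!(chunks.concat(), json);
        Ok(())
    }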

    fn create_test_block(content: &str) -> GutenbergBlock {
        let client_id = uuid::Uuid::new_v4();
        GutenbergBlock {
            client_id,
            name: TEST_BLOCK_NAME.to_string(),
            is_valid: true,
            attributes: {
                let mut map = serde_json::Map::new();
                map.insert("content".to_string(), json!(content));
                map
            },
            inner_blocks: vec![],
        }
    }
}