1use crate::azure_chatbot::{LLMRequest, LLMRequestParams, NonThinkingParams};
2use crate::llm_utils::{APIMessage, MessageRole, estimate_tokens, make_blocking_llm_request};
3use crate::prelude::*;
4use headless_lms_utils::document_schema_processor::GutenbergBlock;
5use serde_json::Value;
6use tracing::{debug, error, info, instrument, warn};
7
/// Context window size, in estimated tokens, from which the per-request budget is derived.
pub const MAX_CONTEXT_WINDOW: i32 = 16000;
/// Fraction of [`MAX_CONTEXT_WINDOW`] that a single request is allowed to use.
pub const MAX_CONTEXT_UTILIZATION: f32 = 0.75;
/// Sampling temperature sent with every conversion request.
pub const REQUEST_TEMPERATURE: f32 = 0.1;
14
/// Delimiter placed before the JSON payload in the user message.
const JSON_BEGIN_MARKER: &str = "---BEGIN COURSE MATERIAL JSON---";
/// Delimiter placed after the JSON payload in the user message.
const JSON_END_MARKER: &str = "---END COURSE MATERIAL JSON---";
18
/// System prompt sent with every request: tells the model how to turn the CMS block JSON
/// into Markdown (which content to keep, how to treat placeholder blocks, what to omit).
const SYSTEM_PROMPT: &str = r#"You are given course material in an abstract JSON format from a headless CMS. Convert this into clean, semantic Markdown that includes all user-visible content to support full-text search.

* Extract and include all meaningful text content: paragraphs, headings, list items, image captions, and similar.
* Retain any inline formatting (like bold or italic text), converting HTML tags (`<strong>`, `<em>`, etc.) into equivalent Markdown formatting.
* For images, use the standard Markdown format: `![alt text](url)`, including a caption if available.
* Preserve heading levels (e.g., level 2 → `##`, level 3 → `###`).
* Include text content from any block type, even non-standard ones, if it appears user-visible.
* For exercise blocks, include the exercise name, and assignment instructions. You may also include text from the exercise specification (public spec), if it can be formatted into markdown.
* If you encounter blocks that don't have any visible text in the JSON but are likely still user-visible (placeholder blocks) — e.g. `glossary`, `exercises-in-this-chapter`, `course-progress` — generate a fake heading representing the expected content (e.g. `## Glossary`).
* Do not generate headings for placeholder blocks that are not user-visible — e.g. `conditionally-visible-content`, `spacer`, `divider`.
* Exclude all purely stylistic attributes (e.g. colors, alignment, font sizes).
* Do not include any metadata, HTML tags (other than for formatting), or non-visible fields.
* Output **only the Markdown content**, and nothing else.
"#;
34
/// Instruction that opens the user message, ahead of the delimited JSON payload.
const USER_PROMPT_START: &str =
    "Convert this JSON content to clean markdown. Output only the markdown, nothing else.";
38
39#[instrument(skip(blocks, app_config), fields(num_blocks = blocks.len()))]
41pub async fn convert_material_blocks_to_markdown_with_llm(
42 blocks: &[GutenbergBlock],
43 app_config: &ApplicationConfiguration,
44) -> anyhow::Result<String> {
45 debug!("Starting content conversion with {} blocks", blocks.len());
46 let system_message = APIMessage {
47 role: MessageRole::System,
48 content: SYSTEM_PROMPT.to_string(),
49 };
50
51 let system_message_tokens = estimate_tokens(&system_message.content);
52 let safe_token_limit = calculate_safe_token_limit(MAX_CONTEXT_WINDOW, MAX_CONTEXT_UTILIZATION);
53 let max_content_tokens = (safe_token_limit - system_message_tokens).max(1);
54
55 debug!(
56 "Token limits - system: {}, safe: {}, max content: {}",
57 system_message_tokens, safe_token_limit, max_content_tokens
58 );
59
60 let chunks = split_blocks_into_chunks(blocks, max_content_tokens)?;
61 debug!("Split content into {} chunks", chunks.len());
62 process_chunks(&chunks, &system_message, app_config).await
63}
64
/// Returns the portion of `context_window` that may be used at the given `utilization`.
///
/// The product is computed in `f32` and converted back with an `as` cast, so any
/// fractional part is dropped.
pub fn calculate_safe_token_limit(context_window: i32, utilization: f32) -> i32 {
    let window = context_window as f32;
    let usable = window * utilization;
    usable as i32
}
69
70fn remove_private_spec_recursive(value: &mut Value) {
72 match value {
73 Value::Object(map) => {
74 map.remove("private_spec");
75 for (_, v) in map.iter_mut() {
76 remove_private_spec_recursive(v);
77 }
78 }
79 Value::Array(arr) => {
80 for item in arr.iter_mut() {
81 remove_private_spec_recursive(item);
82 }
83 }
84 _ => {}
85 }
86}
87
88fn block_to_json_string(block: &GutenbergBlock) -> anyhow::Result<String> {
90 let mut json_value = serde_json::to_value(block)?;
91 remove_private_spec_recursive(&mut json_value);
92 Ok(serde_json::to_string(&json_value)?)
93}
94
95fn blocks_to_json_string(blocks: &[GutenbergBlock]) -> anyhow::Result<String> {
97 let mut json_value = serde_json::to_value(blocks)?;
98 remove_private_spec_recursive(&mut json_value);
99 Ok(serde_json::to_string(&json_value)?)
100}
101
102#[instrument(skip(blocks), fields(max_content_tokens))]
104pub fn split_blocks_into_chunks(
105 blocks: &[GutenbergBlock],
106 max_content_tokens: i32,
107) -> anyhow::Result<Vec<String>> {
108 debug!("Starting to split {} blocks into chunks", blocks.len());
109 let mut chunks: Vec<String> = Vec::new();
110 let mut current_chunk: Vec<GutenbergBlock> = Vec::new();
111 let mut current_chunk_tokens = 0;
112
113 for block in blocks {
114 let block_json = block_to_json_string(block)?;
115 let block_tokens = estimate_tokens(&block_json);
116 debug!(
117 "Processing block {} with {} tokens",
118 block.client_id, block_tokens
119 );
120
121 if block_tokens > max_content_tokens {
123 warn!(
124 "Block {} exceeds max token limit ({} > {})",
125 block.client_id, block_tokens, max_content_tokens
126 );
127 if !current_chunk.is_empty() {
129 chunks.push(blocks_to_json_string(¤t_chunk)?);
130 current_chunk = Vec::new();
131 current_chunk_tokens = 0;
132 }
133
134 split_oversized_block(&block_json, max_content_tokens, &mut chunks)?;
136 continue;
137 }
138
139 if current_chunk_tokens + block_tokens > max_content_tokens {
140 debug!(
141 "Creating new chunk after {} blocks ({} tokens)",
142 current_chunk.len(),
143 current_chunk_tokens
144 );
145 chunks.push(blocks_to_json_string(¤t_chunk)?);
146 current_chunk = Vec::new();
147 current_chunk_tokens = 0;
148 }
149
150 current_chunk.push(block.clone());
151 current_chunk_tokens += block_tokens;
152 }
153
154 if !current_chunk.is_empty() {
155 debug!(
156 "Adding final chunk with {} blocks ({} tokens)",
157 current_chunk.len(),
158 current_chunk_tokens
159 );
160 chunks.push(blocks_to_json_string(¤t_chunk)?);
161 }
162
163 Ok(chunks)
164}
165
166#[instrument(skip(block_json, chunks), fields(max_tokens))]
168fn split_oversized_block(
169 block_json: &str,
170 max_tokens: i32,
171 chunks: &mut Vec<String>,
172) -> anyhow::Result<()> {
173 let total_tokens = estimate_tokens(block_json);
174 debug!(
175 "Splitting oversized block with {} tokens into chunks of max {} tokens",
176 total_tokens, max_tokens
177 );
178
179 let max_tokens_safe = max_tokens.max(1);
182 let num_chunks = (total_tokens as f32 / (max_tokens_safe as f32 * 0.5)).ceil() as usize;
183
184 if num_chunks <= 1 || num_chunks == 0 {
185 chunks.push(block_json.to_string());
186 return Ok(());
187 }
188
189 let bytes_per_chunk = (block_json.len() / num_chunks).max(1);
192 debug!(
193 "Splitting into {} chunks of approximately {} bytes each",
194 num_chunks, bytes_per_chunk
195 );
196
197 let mut start = 0;
198 let mut iterations = 0;
199 const MAX_ITERATIONS: usize = 100;
200 while start < block_json.len() {
201 iterations += 1;
202 if iterations > MAX_ITERATIONS {
203 return Err(anyhow::anyhow!(
204 "Infinite loop protection: exceeded {} iterations in split_oversized_block",
205 MAX_ITERATIONS
206 ));
207 }
208
209 let end_candidate = start
211 .checked_add(bytes_per_chunk)
212 .unwrap_or(block_json.len())
213 .min(block_json.len());
214
215 let mut end = if end_candidate >= block_json.len() {
216 block_json.len()
217 } else {
218 end_candidate
219 };
220
221 while !block_json.is_char_boundary(end) && end > start {
223 end -= 1;
224 }
225
226 if end == start {
228 let mut next_boundary = start
230 .checked_add(1)
231 .unwrap_or(block_json.len())
232 .min(block_json.len());
233
234 let mut boundary_iterations = 0;
235 const MAX_BOUNDARY_ITERATIONS: usize = 100;
236 while next_boundary < block_json.len() && !block_json.is_char_boundary(next_boundary) {
237 boundary_iterations += 1;
238 if boundary_iterations > MAX_BOUNDARY_ITERATIONS {
239 return Err(anyhow::anyhow!(
240 "Infinite loop protection: exceeded {} iterations finding character boundary",
241 MAX_BOUNDARY_ITERATIONS
242 ));
243 }
244 next_boundary = next_boundary
245 .checked_add(1)
246 .unwrap_or(block_json.len())
247 .min(block_json.len());
248 }
249 end = next_boundary.min(block_json.len());
250 }
251
252 if end > start && end <= block_json.len() && start < block_json.len() {
254 let chunk = block_json.get(start..end).ok_or_else(|| {
256 anyhow::anyhow!("Invalid string slice bounds: {}..{}", start, end)
257 })?;
258 chunks.push(chunk.to_string());
259 let new_start = end;
260 if new_start <= start {
262 return Err(anyhow::anyhow!(
263 "Infinite loop protection: start did not advance ({} -> {})",
264 start,
265 new_start
266 ));
267 }
268 start = new_start;
269 } else {
270 if start < block_json.len() {
273 if let Some(remaining) = block_json.get(start..) {
274 if !remaining.is_empty() {
275 chunks.push(remaining.to_string());
276 }
277 }
278 }
279 break;
280 }
281 }
282
283 Ok(())
284}
285
/// Appends `new_content` to `result`, separated from existing text by a blank line.
///
/// Nothing is inserted when `result` is empty or already ends with `"\n\n"`; a single
/// trailing newline is topped up to two; otherwise two newlines are added.
pub fn append_markdown_with_separator(result: &mut String, new_content: &str) {
    let separator = if result.is_empty() || result.ends_with("\n\n") {
        ""
    } else if result.ends_with('\n') {
        "\n"
    } else {
        "\n\n"
    };

    result.push_str(separator);
    result.push_str(new_content);
}
298
299#[instrument(skip(chunks, system_message, app_config), fields(num_chunks = chunks.len()))]
301async fn process_chunks(
302 chunks: &[String],
303 system_message: &APIMessage,
304 app_config: &ApplicationConfiguration,
305) -> anyhow::Result<String> {
306 debug!("Processing {} chunks", chunks.len());
307 let mut result = String::new();
308
309 for (i, chunk) in chunks.iter().enumerate() {
310 debug!("Processing chunk {}/{}", i + 1, chunks.len());
311 let chunk_markdown = process_block_chunk(chunk, system_message, app_config).await?;
312 append_markdown_with_separator(&mut result, &chunk_markdown);
313 }
314
315 info!("Successfully cleaned content with LLM");
316 Ok(result)
317}
318
319#[instrument(skip(chunk, system_message, app_config), fields(chunk_tokens = estimate_tokens(chunk)))]
321async fn process_block_chunk(
322 chunk: &str,
323 system_message: &APIMessage,
324 app_config: &ApplicationConfiguration,
325) -> anyhow::Result<String> {
326 let messages = prepare_llm_messages(chunk, system_message)?;
327 let llm_base_request: LLMRequest = LLMRequest {
328 messages,
329 data_sources: vec![],
330 params: LLMRequestParams::NonThinking(NonThinkingParams {
331 temperature: Some(REQUEST_TEMPERATURE),
332 top_p: None,
333 frequency_penalty: None,
334 presence_penalty: None,
335 max_tokens: None,
336 }),
337 stop: None,
338 };
339 info!(
340 "Processing chunk of approximately {} tokens",
341 estimate_tokens(chunk)
342 );
343
344 let completion = match make_blocking_llm_request(llm_base_request, app_config).await {
345 Ok(completion) => completion,
346 Err(e) => {
347 error!("Failed to process chunk: {}", e);
348 return Err(e);
349 }
350 };
351
352 let cleaned_content = completion
353 .choices
354 .first()
355 .ok_or_else(|| {
356 error!("No content returned from LLM");
357 anyhow::anyhow!("No content returned from LLM")
358 })?
359 .message
360 .content
361 .clone();
362
363 Ok(cleaned_content)
364}
365
366pub fn prepare_llm_messages(
368 chunk: &str,
369 system_message: &APIMessage,
370) -> anyhow::Result<Vec<APIMessage>> {
371 let messages = vec![
372 system_message.clone(),
373 APIMessage {
374 role: MessageRole::User,
375 content: format!(
376 "{}\n\n{}{}\n{}",
377 USER_PROMPT_START, JSON_BEGIN_MARKER, chunk, JSON_END_MARKER
378 ),
379 },
380 ];
381
382 Ok(messages)
383}
384
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    const TEST_BLOCK_NAME: &str = "test/block";

    /// Builds a valid block with a fresh random id and a single `content` attribute.
    fn create_test_block(content: &str) -> GutenbergBlock {
        let mut attributes = serde_json::Map::new();
        attributes.insert("content".to_string(), json!(content));

        GutenbergBlock {
            client_id: uuid::Uuid::new_v4(),
            name: TEST_BLOCK_NAME.to_string(),
            is_valid: true,
            attributes,
            inner_blocks: vec![],
        }
    }

    #[test]
    fn test_calculate_safe_token_limit() {
        let cases = [(1000, 0.75, 750), (16000, 0.75, 12000), (8000, 0.5, 4000)];
        for (window, utilization, expected) in cases {
            assert_eq!(calculate_safe_token_limit(window, utilization), expected);
        }
    }

    #[test]
    fn test_append_markdown_with_separator() {
        // (existing text, expected text after appending "New content")
        let cases = [
            ("", "New content"),
            ("Existing content", "Existing content\n\nNew content"),
            ("Existing content\n", "Existing content\n\nNew content"),
            ("Existing content\n\n", "Existing content\n\nNew content"),
        ];
        for (existing, expected) in cases {
            let mut result = String::from(existing);
            append_markdown_with_separator(&mut result, "New content");
            assert_eq!(result, expected);
        }
    }

    #[test]
    fn test_split_blocks_into_chunks() -> anyhow::Result<()> {
        let blocks = vec![
            create_test_block("a "),
            create_test_block("b b b b b b b b b b b b b b b b b b b b "),
            create_test_block("c c c c c c c c c c c c c c c "),
        ];

        let t1 = estimate_tokens(&block_to_json_string(&blocks[0])?);
        let t2 = estimate_tokens(&block_to_json_string(&blocks[1])?);
        let t3 = estimate_tokens(&block_to_json_string(&blocks[2])?);

        // A budget above the combined size keeps everything in one chunk.
        let single = split_blocks_into_chunks(&blocks, t1 + t2 + t3 + 10)?;
        assert_eq!(single.len(), 1);

        let all_blocks: Vec<GutenbergBlock> = serde_json::from_str(&single[0])?;
        assert_eq!(all_blocks.len(), 3);

        // A budget that only fits the first block puts it alone in the first chunk.
        let tight = split_blocks_into_chunks(&blocks, t1 + 1)?;

        let head: Vec<GutenbergBlock> = serde_json::from_str(&tight[0])?;
        assert_eq!(head.len(), 1);
        assert_eq!(head[0].client_id, blocks[0].client_id);

        // The remaining chunks may be raw pieces; they only need to be non-empty.
        for rest in &tight[1..] {
            assert!(!rest.is_empty());
        }

        Ok(())
    }

    #[test]
    fn test_prepare_llm_messages() -> anyhow::Result<()> {
        let payload = blocks_to_json_string(&[create_test_block("Test content")])?;
        let system_message = APIMessage {
            role: MessageRole::System,
            content: "System prompt".to_string(),
        };

        let messages = prepare_llm_messages(&payload, &system_message)?;
        assert_eq!(messages.len(), 2);

        let (system, user) = (&messages[0], &messages[1]);
        assert_eq!(system.role, MessageRole::System);
        assert_eq!(system.content, "System prompt");
        assert_eq!(user.role, MessageRole::User);
        assert!(user.content.contains(JSON_BEGIN_MARKER));
        assert!(user.content.contains("Test content"));

        Ok(())
    }
}