1use crate::azure_chatbot::{LLMRequest, LLMRequestParams, NonThinkingParams, ThinkingParams};
2use crate::llm_utils::{
3 APIMessage, APIMessageKind, APIMessageText, estimate_tokens, make_blocking_llm_request,
4 parse_text_completion,
5};
6use crate::prelude::*;
7use headless_lms_models::application_task_default_language_models::TaskLMSpec;
8use headless_lms_models::chatbot_conversation_messages::MessageRole;
9use headless_lms_utils::document_schema_processor::GutenbergBlock;
10use serde_json::Value;
11use tracing::{debug, error, info, instrument, warn};
12
/// Sampling temperature used for non-thinking models; kept low so the
/// JSON-to-Markdown conversion stays deterministic and faithful to the input.
pub const REQUEST_TEMPERATURE: f32 = 0.1;

/// Marker placed before the JSON payload in the user prompt so the model can
/// unambiguously locate the course material to convert.
const JSON_BEGIN_MARKER: &str = "---BEGIN COURSE MATERIAL JSON---";
/// Closing counterpart of [`JSON_BEGIN_MARKER`].
const JSON_END_MARKER: &str = "---END COURSE MATERIAL JSON---";
19
/// System prompt instructing the model to convert headless-CMS JSON into
/// clean, search-friendly Markdown containing only user-visible content.
const SYSTEM_PROMPT: &str = r#"You are given course material in an abstract JSON format from a headless CMS. Convert this into clean, semantic Markdown that includes all user-visible content to support full-text search.

* Extract and include all meaningful text content: paragraphs, headings, list items, image captions, and similar.
* Retain any inline formatting (like bold or italic text), converting HTML tags (`<strong>`, `<em>`, etc.) into equivalent Markdown formatting.
* For images, use the standard Markdown format: `![caption](url)`, including a caption if available.
* Preserve heading levels (e.g., level 2 → `##`, level 3 → `###`).
* Include text content from any block type, even non-standard ones, if it appears user-visible.
* For exercise blocks, include the exercise name, and assignment instructions. You may also include text from the exercise specification (public spec), if it can be formatted into markdown.
* If you encounter blocks that don't have any visible text in the JSON but are likely still user-visible (placeholder blocks) — e.g. `glossary`, `exercises-in-this-chapter`, `course-progress` — generate a fake heading representing the expected content (e.g. `## Glossary`).
* Do not generate headings for placeholder blocks that are not user-visible — e.g. `conditionally-visible-content`, `spacer`, `divider`.
* Exclude all purely stylistic attributes (e.g. colors, alignment, font sizes).
* Do not include any metadata, HTML tags (other than for formatting), or non-visible fields.
* Output **only the Markdown content**, and nothing else.
"#;
35
/// Instruction prefixed to each user message, ahead of the marker-wrapped JSON chunk.
const USER_PROMPT_START: &str =
    "Convert this JSON content to clean markdown. Output only the markdown, nothing else.";
39
40#[instrument(skip(blocks, app_config, task_lm), fields(num_blocks = blocks.len()))]
42pub async fn convert_material_blocks_to_markdown_with_llm(
43 blocks: &[GutenbergBlock],
44 app_config: &ApplicationConfiguration,
45 task_lm: &TaskLMSpec,
46) -> anyhow::Result<String> {
47 debug!("Starting content conversion with {} blocks", blocks.len());
48 let system_message = APIMessage {
49 role: MessageRole::System,
50 fields: APIMessageKind::Text(APIMessageText {
51 content: SYSTEM_PROMPT.to_string(),
52 }),
53 };
54
55 let system_message_tokens = estimate_tokens(SYSTEM_PROMPT);
56 let safe_token_limit =
57 calculate_safe_token_limit(task_lm.context_size, task_lm.context_utilization);
58 let max_content_tokens = (safe_token_limit - system_message_tokens).max(1);
59
60 debug!(
61 "Token limits - system: {}, safe: {}, max content: {}",
62 system_message_tokens, safe_token_limit, max_content_tokens
63 );
64
65 let chunks = split_blocks_into_chunks(blocks, max_content_tokens)?;
66 debug!("Split content into {} chunks", chunks.len());
67 process_chunks(&chunks, &system_message, app_config, task_lm).await
68}
69
/// Scales a model's context window by a utilization factor (e.g. 0.75) to
/// leave headroom for token-estimation error, truncating toward zero.
pub fn calculate_safe_token_limit(context_window: i32, utilization: f32) -> i32 {
    let scaled = context_window as f32 * utilization;
    scaled as i32
}
74
75fn remove_private_spec_recursive(value: &mut Value) {
77 match value {
78 Value::Object(map) => {
79 map.remove("private_spec");
80 for (_, v) in map.iter_mut() {
81 remove_private_spec_recursive(v);
82 }
83 }
84 Value::Array(arr) => {
85 for item in arr.iter_mut() {
86 remove_private_spec_recursive(item);
87 }
88 }
89 _ => {}
90 }
91}
92
93fn block_to_json_string(block: &GutenbergBlock) -> anyhow::Result<String> {
95 let mut json_value = serde_json::to_value(block)?;
96 remove_private_spec_recursive(&mut json_value);
97 Ok(serde_json::to_string(&json_value)?)
98}
99
100fn blocks_to_json_string(blocks: &[GutenbergBlock]) -> anyhow::Result<String> {
102 let mut json_value = serde_json::to_value(blocks)?;
103 remove_private_spec_recursive(&mut json_value);
104 Ok(serde_json::to_string(&json_value)?)
105}
106
107#[instrument(skip(blocks), fields(max_content_tokens))]
109pub fn split_blocks_into_chunks(
110 blocks: &[GutenbergBlock],
111 max_content_tokens: i32,
112) -> anyhow::Result<Vec<String>> {
113 debug!("Starting to split {} blocks into chunks", blocks.len());
114 let mut chunks: Vec<String> = Vec::new();
115 let mut current_chunk: Vec<GutenbergBlock> = Vec::new();
116 let mut current_chunk_tokens = 0;
117
118 for block in blocks {
119 let block_json = block_to_json_string(block)?;
120 let block_tokens = estimate_tokens(&block_json);
121 debug!(
122 "Processing block {} with {} tokens",
123 block.client_id, block_tokens
124 );
125
126 if block_tokens > max_content_tokens {
128 warn!(
129 "Block {} exceeds max token limit ({} > {})",
130 block.client_id, block_tokens, max_content_tokens
131 );
132 if !current_chunk.is_empty() {
134 chunks.push(blocks_to_json_string(¤t_chunk)?);
135 current_chunk = Vec::new();
136 current_chunk_tokens = 0;
137 }
138
139 split_oversized_block(&block_json, max_content_tokens, &mut chunks)?;
141 continue;
142 }
143
144 if current_chunk_tokens + block_tokens > max_content_tokens {
145 debug!(
146 "Creating new chunk after {} blocks ({} tokens)",
147 current_chunk.len(),
148 current_chunk_tokens
149 );
150 chunks.push(blocks_to_json_string(¤t_chunk)?);
151 current_chunk = Vec::new();
152 current_chunk_tokens = 0;
153 }
154
155 current_chunk.push(block.clone());
156 current_chunk_tokens += block_tokens;
157 }
158
159 if !current_chunk.is_empty() {
160 debug!(
161 "Adding final chunk with {} blocks ({} tokens)",
162 current_chunk.len(),
163 current_chunk_tokens
164 );
165 chunks.push(blocks_to_json_string(¤t_chunk)?);
166 }
167
168 Ok(chunks)
169}
170
171#[instrument(skip(block_json, chunks), fields(max_tokens))]
173fn split_oversized_block(
174 block_json: &str,
175 max_tokens: i32,
176 chunks: &mut Vec<String>,
177) -> anyhow::Result<()> {
178 let total_tokens = estimate_tokens(block_json);
179 debug!(
180 "Splitting oversized block with {} tokens into chunks of max {} tokens",
181 total_tokens, max_tokens
182 );
183
184 let max_tokens_safe = max_tokens.max(1);
187 let num_chunks = (total_tokens as f32 / (max_tokens_safe as f32 * 0.5)).ceil() as usize;
188
189 if num_chunks <= 1 || num_chunks == 0 {
190 chunks.push(block_json.to_string());
191 return Ok(());
192 }
193
194 let bytes_per_chunk = (block_json.len() / num_chunks).max(1);
197 debug!(
198 "Splitting into {} chunks of approximately {} bytes each",
199 num_chunks, bytes_per_chunk
200 );
201
202 let mut start = 0;
203 let mut iterations = 0;
204 const MAX_ITERATIONS: usize = 100;
205 while start < block_json.len() {
206 iterations += 1;
207 if iterations > MAX_ITERATIONS {
208 return Err(anyhow::anyhow!(
209 "Infinite loop protection: exceeded {} iterations in split_oversized_block",
210 MAX_ITERATIONS
211 ));
212 }
213
214 let end_candidate = start
216 .checked_add(bytes_per_chunk)
217 .unwrap_or(block_json.len())
218 .min(block_json.len());
219
220 let mut end = if end_candidate >= block_json.len() {
221 block_json.len()
222 } else {
223 end_candidate
224 };
225
226 while !block_json.is_char_boundary(end) && end > start {
228 end -= 1;
229 }
230
231 if end == start {
233 let mut next_boundary = start
235 .checked_add(1)
236 .unwrap_or(block_json.len())
237 .min(block_json.len());
238
239 let mut boundary_iterations = 0;
240 const MAX_BOUNDARY_ITERATIONS: usize = 100;
241 while next_boundary < block_json.len() && !block_json.is_char_boundary(next_boundary) {
242 boundary_iterations += 1;
243 if boundary_iterations > MAX_BOUNDARY_ITERATIONS {
244 return Err(anyhow::anyhow!(
245 "Infinite loop protection: exceeded {} iterations finding character boundary",
246 MAX_BOUNDARY_ITERATIONS
247 ));
248 }
249 next_boundary = next_boundary
250 .checked_add(1)
251 .unwrap_or(block_json.len())
252 .min(block_json.len());
253 }
254 end = next_boundary.min(block_json.len());
255 }
256
257 if end > start && end <= block_json.len() && start < block_json.len() {
259 let chunk = block_json.get(start..end).ok_or_else(|| {
261 anyhow::anyhow!("Invalid string slice bounds: {}..{}", start, end)
262 })?;
263 chunks.push(chunk.to_string());
264 let new_start = end;
265 if new_start <= start {
267 return Err(anyhow::anyhow!(
268 "Infinite loop protection: start did not advance ({} -> {})",
269 start,
270 new_start
271 ));
272 }
273 start = new_start;
274 } else {
275 if start < block_json.len()
278 && let Some(remaining) = block_json.get(start..)
279 && !remaining.is_empty()
280 {
281 chunks.push(remaining.to_string());
282 }
283 break;
284 }
285 }
286
287 Ok(())
288}
289
/// Appends `new_content` to `result`, ensuring a blank line (two newlines)
/// separates it from any existing content. An empty `result` receives the
/// content with no separator.
pub fn append_markdown_with_separator(result: &mut String, new_content: &str) {
    if !result.is_empty() {
        // Pad with newlines until a full blank-line separator is present;
        // at most two pushes are ever needed.
        while !result.ends_with("\n\n") {
            result.push('\n');
        }
    }
    result.push_str(new_content);
}
302
303#[instrument(skip(chunks, system_message, app_config, task_lm), fields(num_chunks = chunks.len()))]
305async fn process_chunks(
306 chunks: &[String],
307 system_message: &APIMessage,
308 app_config: &ApplicationConfiguration,
309 task_lm: &TaskLMSpec,
310) -> anyhow::Result<String> {
311 debug!("Processing {} chunks", chunks.len());
312 let mut result = String::new();
313
314 for (i, chunk) in chunks.iter().enumerate() {
315 debug!("Processing chunk {}/{}", i + 1, chunks.len());
316 let chunk_markdown =
317 process_block_chunk(chunk, system_message, app_config, task_lm).await?;
318 append_markdown_with_separator(&mut result, &chunk_markdown);
319 }
320
321 info!("Successfully cleaned content with LLM");
322 Ok(result)
323}
324
325#[instrument(skip(chunk, system_message, app_config, task_lm), fields(chunk_tokens = estimate_tokens(chunk)))]
327async fn process_block_chunk(
328 chunk: &str,
329 system_message: &APIMessage,
330 app_config: &ApplicationConfiguration,
331 task_lm: &TaskLMSpec,
332) -> ChatbotResult<String> {
333 let messages = prepare_llm_messages(chunk, system_message)?;
334 let params = if task_lm.thinking {
335 LLMRequestParams::Thinking(ThinkingParams {
336 max_completion_tokens: None,
337 verbosity: None,
338 reasoning_effort: None,
339 tools: vec![],
340 tool_choice: None,
341 })
342 } else {
343 LLMRequestParams::NonThinking(NonThinkingParams {
344 temperature: Some(REQUEST_TEMPERATURE),
345 top_p: None,
346 frequency_penalty: None,
347 presence_penalty: None,
348 max_tokens: None,
349 })
350 };
351 let llm_base_request: LLMRequest = LLMRequest {
352 messages,
353 data_sources: vec![],
354 params,
355 response_format: None,
356 stop: None,
357 };
358 info!(
359 "Processing chunk of approximately {} tokens",
360 estimate_tokens(chunk)
361 );
362
363 let completion = match make_blocking_llm_request(llm_base_request, app_config, task_lm).await {
364 Ok(completion) => completion,
365 Err(e) => {
366 error!("Failed to process chunk: {}", e);
367 return Err(ChatbotError::from(e));
368 }
369 };
370
371 parse_text_completion(completion)
372}
373
374pub fn prepare_llm_messages(
376 chunk: &str,
377 system_message: &APIMessage,
378) -> anyhow::Result<Vec<APIMessage>> {
379 let messages = vec![
380 system_message.clone(),
381 APIMessage {
382 role: MessageRole::User,
383 fields: APIMessageKind::Text(APIMessageText {
384 content: format!(
385 "{}\n\n{}{}\n{}",
386 USER_PROMPT_START, JSON_BEGIN_MARKER, chunk, JSON_END_MARKER
387 ),
388 }),
389 },
390 ];
391
392 Ok(messages)
393}
394
#[cfg(test)]
mod tests {
    use super::*;
    use crate::llm_utils::{APIMessageKind, APIMessageText};
    use serde_json::json;

    const TEST_BLOCK_NAME: &str = "test/block";

    #[test]
    fn test_calculate_safe_token_limit() {
        assert_eq!(calculate_safe_token_limit(1000, 0.75), 750);
        assert_eq!(calculate_safe_token_limit(16000, 0.75), 12000);
        assert_eq!(calculate_safe_token_limit(8000, 0.5), 4000);
    }

    #[test]
    fn test_append_markdown_with_separator() {
        let mut result = String::new();
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "New content");

        let mut result = String::from("Existing content");
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "Existing content\n\nNew content");

        let mut result = String::from("Existing content\n");
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "Existing content\n\nNew content");

        let mut result = String::from("Existing content\n\n");
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "Existing content\n\nNew content");
    }

    #[test]
    fn test_split_blocks_into_chunks() -> anyhow::Result<()> {
        let block1 = create_test_block("a ");
        let block2 = create_test_block("b b b b b b b b b b b b b b b b b b b b ");
        let block3 = create_test_block("c c c c c c c c c c c c c c c ");
        let blocks = vec![block1.clone(), block2.clone(), block3.clone()];

        let t1 = estimate_tokens(&block_to_json_string(&block1)?);
        let t2 = estimate_tokens(&block_to_json_string(&block2)?);
        let t3 = estimate_tokens(&block_to_json_string(&block3)?);

        // A budget larger than all blocks together yields a single chunk.
        let chunks = split_blocks_into_chunks(&blocks, t1 + t2 + t3 + 10)?;
        assert_eq!(chunks.len(), 1);

        let deserialized_chunk: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(deserialized_chunk.len(), 3);

        // A budget that only fits the first block forces it into its own chunk.
        let chunks = split_blocks_into_chunks(&blocks, t1 + 1)?;

        let first_chunk: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(first_chunk.len(), 1);
        assert_eq!(first_chunk[0].client_id, block1.client_id);

        // Remaining chunks may be raw JSON fragments, but never empty.
        for chunk in &chunks[1..] {
            assert!(!chunk.is_empty());
        }

        Ok(())
    }

    #[test]
    fn test_prepare_llm_messages() -> anyhow::Result<()> {
        let blocks = vec![create_test_block("Test content")];
        let blocks_json = blocks_to_json_string(&blocks)?;
        let system_message = APIMessage {
            role: MessageRole::System,
            fields: APIMessageKind::Text(APIMessageText {
                content: "System prompt".to_string(),
            }),
        };

        let messages = prepare_llm_messages(&blocks_json, &system_message)?;

        assert_eq!(messages.len(), 2);
        let msg1_content = match &messages[0].fields {
            APIMessageKind::Text(msg) => &msg.content,
            _ => "",
        };
        let msg2_content = match &messages[1].fields {
            APIMessageKind::Text(msg) => &msg.content,
            _ => "",
        };
        assert_eq!(messages[0].role, MessageRole::System);
        assert_eq!(msg1_content, "System prompt");
        assert_eq!(messages[1].role, MessageRole::User);
        assert!(msg2_content.contains(JSON_BEGIN_MARKER));
        assert!(msg2_content.contains("Test content"));

        Ok(())
    }

    /// Builds a minimal valid block whose `content` attribute carries the
    /// given text; used to drive the chunking and message-preparation tests.
    fn create_test_block(content: &str) -> GutenbergBlock {
        let client_id = uuid::Uuid::new_v4();
        GutenbergBlock {
            client_id,
            name: TEST_BLOCK_NAME.to_string(),
            is_valid: true,
            attributes: {
                let mut map = serde_json::Map::new();
                map.insert("content".to_string(), json!(content));
                map
            },
            inner_blocks: vec![],
        }
    }
}