use crate::llm_utils::{Message, MessageRole, estimate_tokens, make_blocking_llm_request};
use crate::prelude::*;
use headless_lms_utils::document_schema_processor::GutenbergBlock;
use tracing::{debug, error, info, instrument, warn};

/// Maximum size of the LLM context window, in tokens.
pub const MAX_CONTEXT_WINDOW: i32 = 16000;
/// Fraction of the context window that may be filled with input.
pub const MAX_CONTEXT_UTILIZATION: f32 = 0.75;
/// Sampling temperature for the conversion requests; kept low for consistent output.
pub const REQUEST_TEMPERATURE: f32 = 0.1;

const JSON_BEGIN_MARKER: &str = "---BEGIN COURSE MATERIAL JSON---";
const JSON_END_MARKER: &str = "---END COURSE MATERIAL JSON---";

const SYSTEM_PROMPT: &str = r#"You are given course material in an abstract JSON format from a headless CMS. Convert this into clean, semantic Markdown that includes all user-visible content to support full-text search.

* Extract and include all meaningful text content: paragraphs, headings, list items, image captions, and similar.
* Retain any inline formatting (like bold or italic text), converting HTML tags (`<strong>`, `<em>`, etc.) into equivalent Markdown formatting.
* For images, use the standard Markdown format `![alt text](image URL)`, including a caption if available.
* Preserve heading levels (e.g., level 2 → `##`, level 3 → `###`).
* Include text content from any block type, even non-standard ones, if it appears user-visible.
* For exercise blocks, include the exercise name and the assignment instructions. You may also include text from the exercise specification (public spec), if it can be formatted into markdown.
* If you encounter blocks that don't have any visible text in the JSON but are likely still user-visible (placeholder blocks) — e.g. `glossary`, `exercises-in-this-chapter`, `course-progress` — generate a fake heading representing the expected content (e.g. `## Glossary`).
* Do not generate headings for placeholder blocks that are not user-visible — e.g. `conditionally-visible-content`, `spacer`, `divider`.
* Exclude all purely stylistic attributes (e.g. colors, alignment, font sizes).
* Do not include any metadata, HTML tags (other than for formatting), or non-visible fields.
* Output **only the Markdown content**, and nothing else.
"#;

const USER_PROMPT_START: &str =
    "Convert this JSON content to clean markdown. Output only the markdown, nothing else.";

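/// Converts Gutenberg course material blocks into Markdown by sending them to an LLM in
/// token-bounded chunks and concatenating the results.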
#[instrument(skip(blocks, app_config), fields(num_blocks = blocks.len()))]
pub async fn convert_material_blocks_to_markdown_with_llm(
    blocks: &[GutenbergBlock],
    app_config: &ApplicationConfiguration,
) -> anyhow::Result<String> {
    debug!("Starting content conversion with {} blocks", blocks.len());
    let system_message = Message {
        role: MessageRole::System,
        content: SYSTEM_PROMPT.to_string(),
    };

    let system_message_tokens = estimate_tokens(&system_message.content);
    let safe_token_limit = calculate_safe_token_limit(MAX_CONTEXT_WINDOW, MAX_CONTEXT_UTILIZATION);
    let max_content_tokens = safe_token_limit - system_message_tokens;

    debug!(
        "Token limits - system: {}, safe: {}, max content: {}",
        system_message_tokens, safe_token_limit, max_content_tokens
    );

    let chunks = split_blocks_into_chunks(blocks, max_content_tokens)?;
    debug!("Split content into {} chunks", chunks.len());
    process_chunks(&chunks, &system_message, app_config).await
}

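/// Returns the number of tokens that can safely be used for a request: the context window scaled
/// by the target utilization.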
pub fn calculate_safe_token_limit(context_window: i32, utilization: f32) -> i32 {
    (context_window as f32 * utilization) as i32
}

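/// Serializes the blocks into JSON chunks that each stay under `max_content_tokens`. Blocks that
/// are too large on their own are split further by `split_oversized_block`.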
#[instrument(skip(blocks), fields(max_content_tokens))]
pub fn split_blocks_into_chunks(
    blocks: &[GutenbergBlock],
    max_content_tokens: i32,
) -> anyhow::Result<Vec<String>> {
    debug!("Starting to split {} blocks into chunks", blocks.len());
    let mut chunks: Vec<String> = Vec::new();
    let mut current_chunk: Vec<GutenbergBlock> = Vec::new();
    let mut current_chunk_tokens = 0;

    for block in blocks {
        let block_json = serde_json::to_string(block)?;
        let block_tokens = estimate_tokens(&block_json);
        debug!(
            "Processing block {} with {} tokens",
            block.client_id, block_tokens
        );

        if block_tokens > max_content_tokens {
            warn!(
                "Block {} exceeds max token limit ({} > {})",
                block.client_id, block_tokens, max_content_tokens
            );
            if !current_chunk.is_empty() {
                chunks.push(serde_json::to_string(&current_chunk)?);
                current_chunk = Vec::new();
                current_chunk_tokens = 0;
            }

            split_oversized_block(&block_json, max_content_tokens, &mut chunks)?;
            continue;
        }

        if current_chunk_tokens + block_tokens > max_content_tokens {
            debug!(
                "Creating new chunk after {} blocks ({} tokens)",
                current_chunk.len(),
                current_chunk_tokens
            );
            chunks.push(serde_json::to_string(&current_chunk)?);
            current_chunk = Vec::new();
            current_chunk_tokens = 0;
        }

        current_chunk.push(block.clone());
        current_chunk_tokens += block_tokens;
    }

    if !current_chunk.is_empty() {
        debug!(
            "Adding final chunk with {} blocks ({} tokens)",
            current_chunk.len(),
            current_chunk_tokens
        );
        chunks.push(serde_json::to_string(&current_chunk)?);
    }

    Ok(chunks)
}

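/// Splits a single block's JSON string into several roughly equal character ranges so that even a
/// block exceeding the token limit can be sent to the LLM. The resulting pieces are raw string
/// slices, not standalone JSON documents.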
#[instrument(skip(block_json, chunks), fields(max_tokens))]
fn split_oversized_block(
    block_json: &str,
    max_tokens: i32,
    chunks: &mut Vec<String>,
) -> anyhow::Result<()> {
    let total_tokens = estimate_tokens(block_json);
    debug!(
        "Splitting oversized block with {} tokens into chunks of max {} tokens",
        total_tokens, max_tokens
    );

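    // Aim for pieces of roughly half the allowed token budget.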
    let num_chunks = (total_tokens as f32 / (max_tokens as f32 * 0.5)).ceil() as usize;

    if num_chunks <= 1 {
        chunks.push(block_json.to_string());
        return Ok(());
    }

    let chars_per_chunk = block_json.len() / num_chunks;
    debug!(
        "Splitting into {} chunks of approximately {} chars each",
        num_chunks, chars_per_chunk
    );

    let mut start = 0;
    while start < block_json.len() {
        let mut end = if start + chars_per_chunk >= block_json.len() {
            block_json.len()
        } else {
            start + chars_per_chunk
        };
        // Make sure we never slice in the middle of a multi-byte UTF-8 character, which would panic.
        while !block_json.is_char_boundary(end) {
            end += 1;
        }

        let chunk = &block_json[start..end];
        chunks.push(chunk.to_string());

        start = end;
    }

    Ok(())
}

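/// Appends `new_content` to `result`, ensuring the two parts are separated by a blank line.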
pub fn append_markdown_with_separator(result: &mut String, new_content: &str) {
    if !result.is_empty() && !result.ends_with("\n\n") {
        if result.ends_with('\n') {
            result.push('\n');
        } else {
            result.push_str("\n\n");
        }
    }

    result.push_str(new_content);
}

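/// Converts each chunk with the LLM and joins the resulting Markdown pieces together.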
#[instrument(skip(chunks, system_message, app_config), fields(num_chunks = chunks.len()))]
async fn process_chunks(
    chunks: &[String],
    system_message: &Message,
    app_config: &ApplicationConfiguration,
) -> anyhow::Result<String> {
    debug!("Processing {} chunks", chunks.len());
    let mut result = String::new();

    for (i, chunk) in chunks.iter().enumerate() {
        debug!("Processing chunk {}/{}", i + 1, chunks.len());
        let chunk_markdown = process_block_chunk(chunk, system_message, app_config).await?;
        append_markdown_with_separator(&mut result, &chunk_markdown);
    }

    info!("Successfully cleaned content with LLM");
    Ok(result)
}

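/// Sends a single JSON chunk to the LLM and returns the Markdown from the first completion choice.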
#[instrument(skip(chunk, system_message, app_config), fields(chunk_tokens = estimate_tokens(chunk)))]
async fn process_block_chunk(
    chunk: &str,
    system_message: &Message,
    app_config: &ApplicationConfiguration,
) -> anyhow::Result<String> {
    let messages = prepare_llm_messages(chunk, system_message)?;

    info!(
        "Processing chunk of approximately {} tokens",
        estimate_tokens(chunk)
    );

    let completion =
        match make_blocking_llm_request(messages, REQUEST_TEMPERATURE, None, app_config).await {
            Ok(completion) => completion,
            Err(e) => {
                error!("Failed to process chunk: {}", e);
                return Err(e);
            }
        };

    let cleaned_content = completion
        .choices
        .first()
        .ok_or_else(|| {
            error!("No content returned from LLM");
            anyhow::anyhow!("No content returned from LLM")
        })?
        .message
        .content
        .clone();

    Ok(cleaned_content)
}

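/// Builds the system and user messages for one chunk, wrapping the JSON payload between the
/// begin/end markers.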
pub fn prepare_llm_messages(chunk: &str, system_message: &Message) -> anyhow::Result<Vec<Message>> {
    let messages = vec![
        system_message.clone(),
        Message {
            role: MessageRole::User,
            content: format!(
                "{}\n\n{}{}\n{}",
                USER_PROMPT_START, JSON_BEGIN_MARKER, chunk, JSON_END_MARKER
            ),
        },
    ];

    Ok(messages)
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    const TEST_BLOCK_NAME: &str = "test/block";

    #[test]
    fn test_calculate_safe_token_limit() {
        assert_eq!(calculate_safe_token_limit(1000, 0.75), 750);
        assert_eq!(calculate_safe_token_limit(16000, 0.75), 12000);
        assert_eq!(calculate_safe_token_limit(8000, 0.5), 4000);
    }

    #[test]
    fn test_append_markdown_with_separator() {
        let mut result = String::new();
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "New content");

        let mut result = String::from("Existing content");
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "Existing content\n\nNew content");

        let mut result = String::from("Existing content\n");
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "Existing content\n\nNew content");

        let mut result = String::from("Existing content\n\n");
        append_markdown_with_separator(&mut result, "New content");
        assert_eq!(result, "Existing content\n\nNew content");
    }

    #[test]
    fn test_split_blocks_into_chunks() -> anyhow::Result<()> {
        let block1 = create_test_block("a ");
        let block2 = create_test_block("b b b b b b b b b b b b b b b b b b b b ");
        let block3 = create_test_block("c c c c c c c c c c c c c c c ");
        let blocks = vec![block1.clone(), block2.clone(), block3.clone()];

        let t1 = estimate_tokens(&serde_json::to_string(&block1)?);
        let t2 = estimate_tokens(&serde_json::to_string(&block2)?);
        let t3 = estimate_tokens(&serde_json::to_string(&block3)?);

        let chunks = split_blocks_into_chunks(&blocks, t1 + t2 + t3 + 10)?;
        assert_eq!(chunks.len(), 1);

        let deserialized_chunk: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(deserialized_chunk.len(), 3);

        let chunks = split_blocks_into_chunks(&blocks, t1 + 1)?;

        let first_chunk: Vec<GutenbergBlock> = serde_json::from_str(&chunks[0])?;
        assert_eq!(first_chunk.len(), 1);
        assert_eq!(first_chunk[0].client_id, block1.client_id);

        for chunk in &chunks[1..] {
            assert!(!chunk.is_empty());
        }

        Ok(())
    }

    #[test]
    fn test_prepare_llm_messages() -> anyhow::Result<()> {
        let blocks = vec![create_test_block("Test content")];
        let blocks_json = serde_json::to_string(&blocks)?;
        let system_message = Message {
            role: MessageRole::System,
            content: "System prompt".to_string(),
        };

        let messages = prepare_llm_messages(&blocks_json, &system_message)?;

        assert_eq!(messages.len(), 2);
        assert_eq!(messages[0].role, MessageRole::System);
        assert_eq!(messages[0].content, "System prompt");
        assert_eq!(messages[1].role, MessageRole::User);
        assert!(messages[1].content.contains(JSON_BEGIN_MARKER));
        assert!(messages[1].content.contains("Test content"));

        Ok(())
    }

    fn create_test_block(content: &str) -> GutenbergBlock {
        let client_id = uuid::Uuid::new_v4();
        GutenbergBlock {
            client_id,
            name: TEST_BLOCK_NAME.to_string(),
            is_valid: true,
            attributes: {
                let mut map = serde_json::Map::new();
                map.insert("content".to_string(), json!(content));
                map
            },
            inner_blocks: vec![],
        }
    }
}
368}