godai78 · godai78 · Jun 12, 2025 · Jun 12, 2025 · Jun 12, 2025 · Jun 13, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,8 +2,9 @@
 
 All notable changes to this project will be documented in this file.
 
-## [0.5] - 2025-06-11
+## [0.6] - 2025-06-12
 ### Added
+- RTF input file support
 - Error messages (for wrong file format and missing headers levels)
 
 ## [0.4] - 2025-05-29

diff --git a/README.md b/README.md
@@ -1,12 +1,12 @@
-# DOCX Splitter v 0.5
+# DOCX Splitter v 0.6
 
-A web application that splits Microsoft Word `.docx` files by selected heading level and saves them as separate `.rtf` files.
+A web application that splits Microsoft Word `.docx` files and RTF files by selected heading level and saves them as separate `.rtf` files.
 
 ![Pepe Agent](pepeagent.gif)
 
 ## Features
 
-- Split `.docx` files by headings (H1-H6)
+- Split `.docx` and `.rtf` files by headings (H1-H6)
 - Select which heading level to split at (splits at selected level and all levels above it)
 - Clean and modern web interface
 - Automatic file naming based on headings

diff --git a/app.js b/app.js
@@ -8,19 +8,26 @@ document.addEventListener('DOMContentLoaded', () => {
 	splitButton.addEventListener('click', async () => {
 		const file = docxFile.files[0];
 		if (!file) {
-			setStatus('Please select a DOCX file first', 'error');
+			setStatus('Please select a file first', 'error');
 			return;
 		}
 
-		// Check if the file is a DOCX file
-		if (!file.name.toLowerCase().endsWith('.docx')) {
-			setStatus('Unsupported file format', 'error');
+		// Check if the file is a supported format
+		const fileExtension = file.name.toLowerCase().split('.').pop();
+		if (!['docx', 'rtf'].includes(fileExtension)) {
+			setStatus('Unsupported file format. Please use DOCX or RTF files.', 'error');
 			return;
 		}
 
 		try {
 			setStatus('Processing document...', 'info');
-			const content = await processDocx(file);
+			let content;
+
+			if (fileExtension === 'docx') {
+				content = await processDocx(file);
+			} else {
+				content = await processRtf(file);
+			}
 
 			// Check if there are headers of the selected level
 			if (!hasHeadersOfLevel(content, parseInt(headingLevel.value))) {
@@ -89,6 +96,134 @@ document.addEventListener('DOMContentLoaded', () => {
 		});
 	}
 
+	/**
+	 * Reads an RTF file and converts it to simple HTML (<h1>-<h6> and <p>) so it
+	 * can go through the same heading-based splitting as DOCX input.
+	 *
+	 * Headings are guessed heuristically: a paragraph counts as a heading when it
+	 * is bold, has a font size of at least \fs20, or contains both a bookmark
+	 * start and a bookmark end. Larger font sizes map to higher heading levels.
+	 * Side effect: stores the detected codepage in `globalCodepage`.
+	 *
+	 * @param {File} file - RTF file selected by the user.
+	 * @returns {Promise<string>} HTML with one element per non-empty paragraph.
+	 * Rejects if \ansicpg or \viewkind is missing, the codepage has no matching
+	 * TextDecoder label, or the file cannot be read.
+	 */
+	async function processRtf(file) {
+		// Escapes text so it is treated as text, not markup, in the generated HTML.
+		const escapeHtml = (value) => value
+			.replace(/&/g, '&amp;')
+			.replace(/</g, '&lt;')
+			.replace(/>/g, '&gt;');
+
+		// Maps an \fsN value to a heading level; anything below 20 becomes H6.
+		const levelForFontSize = (fontSize) => {
+			if (fontSize >= 40) return 1;
+			if (fontSize >= 32) return 2;
+			if (fontSize >= 28) return 3;
+			if (fontSize >= 24) return 4;
+			if (fontSize >= 20) return 5;
+			return 6;
+		};
+
+		const reader = new FileReader();
+		return new Promise((resolve, reject) => {
+			reader.onload = (event) => {
+				try {
+					// Read as ArrayBuffer to preserve encoding
+					const arrayBuffer = event.target.result;
+
+					// The codepage declaration is looked up in the first 1000 bytes,
+					// before the body can be decoded with the right encoding.
+					const headerView = new Uint8Array(arrayBuffer.slice(0, 1000));
+					const headerText = new TextDecoder('latin1').decode(headerView);
+					const codepageMatch = headerText.match(/\\ansicpg(\d+)/);
+					if (!codepageMatch) {
+						throw new Error('Could not detect codepage in RTF file');
+					}
+
+					const codepage = codepageMatch[1];
+					if (debug) console.log('Detected codepage from RTF:', codepage);
+
+					// TextDecoder accepts only some windows-* labels; report anything
+					// else with a clear message instead of a bare RangeError.
+					const label = codepage === '65001' ? 'utf-8' : `windows-${codepage}`;
+					let decoder;
+					try {
+						decoder = new TextDecoder(label);
+					} catch (cause) {
+						throw new Error(`Unsupported RTF codepage: ${codepage}`, { cause });
+					}
+
+					// Set the global codepage only once it is known to be usable
+					globalCodepage = codepage;
+					const rtfContent = decoder.decode(arrayBuffer);
+					if (debug) console.log('=== RTF Processing Debug ===');
+
+					// Skip the RTF header and font table to get to the actual content
+					const contentStart = rtfContent.indexOf('\\viewkind');
+					if (contentStart === -1) {
+						throw new Error('Could not find document content in RTF file');
+					}
+
+					// Split on the \par control word only, not inside \pard, \pararsid, ...
+					const paragraphs = rtfContent.substring(contentStart).split(/\\par(?![a-z])/);
+					if (debug) console.log('Found paragraphs:', paragraphs.length);
+
+					let processedContent = '';
+					let headerCount = 0;
+
+					paragraphs.forEach(paragraph => {
+						// \b or \bN (N > 0) turns bold on; \b0, \bkmkstart, \brdr... do not.
+						const hasBold = /\\b(?:[1-9]\d*)?(?![a-z0-9])/.test(paragraph);
+						const fontSizeMatch = paragraph.match(/\\fs(\d+)/);
+						const fontSize = fontSizeMatch ? parseInt(fontSizeMatch[1], 10) : 0;
+						const hasBookmarks = paragraph.includes('{\\*\\bkmkstart')
+							&& paragraph.includes('{\\*\\bkmkend');
+
+						const text = paragraph
+							// Drop bookmark groups first, while their braces still delimit them.
+							.replace(/\{\\\*\\bkmk(?:start|end)[^}]*\}/g, '')
+							// Decode \uN with its fallback character (assumes the default \uc1);
+							// every other control word is removed.
+							.replace(/\\u(-?\d+) ?(?:\\'[0-9a-fA-F]{2}|[^\\{}])?|\\[a-z0-9]+\s?/g, (match, unicode) => (
+								unicode === undefined ? '' : String.fromCharCode(Number(unicode))))
+							// Remove leftover group braces and \* destination markers.
+							.replace(/\\\*|[{}]/g, '')
+							// Decode runs of \'hh together so multi-byte sequences are not split.
+							.replace(/(?:\\'[0-9a-fA-F]{2})+/g, (run) => decoder.decode(
+								Uint8Array.from(run.match(/[0-9a-fA-F]{2}/g), (hex) => parseInt(hex, 16))))
+							.trim();
+
+						// Paragraphs that contain only formatting produce no output.
+						if (!text) return;
+
+						// Determine if this is a header based on formatting or bookmarks
+						if (hasBold || fontSize >= 20 || hasBookmarks) {
+							const level = levelForFontSize(fontSize);
+							if (debug) console.log('Found header:', { text, level, fontSize, hasBookmarks });
+							processedContent += `<h${level}>${escapeHtml(text)}</h${level}>\n`;
+							headerCount++;
+						} else {
+							processedContent += `<p>${escapeHtml(text)}</p>\n`;
+						}
+					});
+
+					if (debug) console.log('Total headers found:', headerCount);
+					if (debug) console.log('=== End RTF Processing Debug ===');
+					resolve(processedContent);
+				} catch (error) {
+					console.error('Error processing RTF:', error);
+					reject(error);
+				}
+			};
+			// Reject with the reader's error object rather than the progress event.
+			reader.onerror = () => reject(reader.error);
+			reader.readAsArrayBuffer(file);
+		});
+	}
+
 	// Global variable to store the codepage
 	let globalCodepage = "65001"; // Default to UTF-8
 
@@ -110,8 +245,14 @@ document.addEventListener('DOMContentLoaded', () => {
 					if (currentSection) {
 						sections.push(currentSection);
 					}
+
+					// Get the text content and remove any RTF artifacts
+					const title = element.textContent
+						.replace(/^[^a-zA-Z]*/, '')  // Remove everything before first letter
+						.trim();
+
 					currentSection = {
-						title: element.textContent,
+						title: title,
 						content: ''
 					};
 				} else if (currentSection) {
@@ -138,17 +279,31 @@ document.addEventListener('DOMContentLoaded', () => {
 	}
 
 	function hasHeadersOfLevel(html, targetLevel) {
+		console.log('=== Header Level Check Debug ===');
+		console.log('Target level:', targetLevel);
+		console.log('HTML content:', html);
+
 		const parser = new DOMParser();
 		const doc = parser.parseFromString(html, 'text/html');
 		const elements = Array.from(doc.body.childNodes);
 
-		return elements.some(element => {
+		console.log('Document elements:', elements.map(el => ({
+			tagName: el.tagName,
+			textContent: el.textContent?.substring(0, 50)
+		})));
+
+		const hasHeaders = elements.some(element => {
 			if (element.nodeType === Node.ELEMENT_NODE) {
 				const headingLevel = getHeadingLevel(element);
+				console.log('Element:', element.tagName, 'Level:', headingLevel, 'Text:', element.textContent?.substring(0, 50));
 				return headingLevel && headingLevel <= targetLevel;
 			}
 			return false;
 		});
+
+		console.log('Has headers:', hasHeaders);
+		console.log('=== End Header Level Check Debug ===');
+		return hasHeaders;
 	}
 
 	async function saveSectionsAsDocx(sections) {

diff --git a/index.html b/index.html
@@ -11,7 +11,7 @@
 			<h1>DOCX Splitter</h1>
 			<h3>Input docx and split every header into a new rtf file</h3>
 			<div class="upload-section">
-				<input type="file" id="docxFile" accept=".docx" />
+				<input type="file" id="docxFile" accept=".docx,.rtf" />
 				<select id="headingLevel" class="heading-select" title="Split at this level and all levels above it">
 					<option value="1" selected>Split at H1 only</option>
 					<option value="2">Split at H1 and H2</option>
@@ -30,6 +30,7 @@ <h3>Input docx and split every header into a new rtf file</h3>
 		</div>
 		<script src="https://cdnjs.cloudflare.com/ajax/libs/mammoth/1.5.0/mammoth.browser.min.js"></script>
 		<script src="https://cdnjs.cloudflare.com/ajax/libs/FileSaver.js/2.0.5/FileSaver.min.js"></script>
+		<script src="https://cdn.jsdelivr.net/npm/rtf-parser@1.0.0/dist/rtf-parser.min.js"></script>
 		<script src="app.js"></script>
 	</body>
 </html>
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -8,7 +8,8 @@
 	},
 	"dependencies": {
 		"mammoth": "^1.5.0",
-		"file-saver": "^2.0.5"
+		"file-saver": "^2.0.5",
+		"rtf-parser": "^1.0.0"
 	},
 	"devDependencies": {
 		"http-server": "^14.1.1"

diff --git a/user_manual.md b/user_manual.md
@@ -46,3 +46,8 @@ See [technical details](technical_details.md) for, uhm, details?
 * Why did you vibe-code it, AI is killing our market!
 
   It's killing mine, too, and the markets of plenty of my friends, but I just cannot devote several weeks to learn to code a tool I will use twice. Tried before and failed.
+
+## Error codes
+
+* `No headings of the selected size present in the document.` - the uploaded document does not have headings of the specified level. For example, you chose H1 headings as a split point, but the document only has H2 or smaller headings.
+* `Unsupported file format.` - the file format you tried to upload is not supported.