From 68cac95fa495c17596fdb9b77bb4feee7d7ff1eb Mon Sep 17 00:00:00 2001
From: Bartek Biedrzycki <b.biedrzycki@cksource.com>
Date: Thu, 12 Jun 2025 12:27:13 +0200
Subject: [PATCH 1/6] Adding RTF input.

---
 CHANGELOG.md      |   6 +++
 README.md         |   6 +--
 app.js            | 121 +++++++++++++++++++++++++++++++++++++++++++---
 index.html        |   3 +-
 package-lock.json |  26 +++++++++-
 package.json      |   3 +-
 6 files changed, 152 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a339e94..de83ef3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 All notable changes to this project will be documented in this file.
 
+## [0.6] - 2025-06-12
+### Added
+- RTF input file support
+- RTF to HTML conversion for processing
+- Support for RTF formatting (bold, italic, underline)
+
 ## [0.5] - 2025-06-11
 ### Added
 - Error messages (for wrong file format and missing headers levels)
diff --git a/README.md b/README.md
index 403fbc2..263b7df 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
-# DOCX Splitter v 0.5
+# DOCX Splitter v 0.6
 
-A web application that splits Microsoft Word `.docx` files by selected heading level and saves them as separate `.rtf` files.
+A web application that splits Microsoft Word `.docx` files and RTF files by selected heading level and saves them as separate `.rtf` files.
 
 ![Pepe Agent](pepeagent.gif)
 
 ## Features
 
-- Split `.docx` files by headings (H1-H6)
+- Split `.docx` and `.rtf` files by headings (H1-H6)
 - Select which heading level to split at (splits at selected level and all levels above it)
 - Clean and modern web interface
 - Automatic file naming based on headings
diff --git a/app.js b/app.js
index 1394d36..a3bf81f 100644
--- a/app.js
+++ b/app.js
@@ -8,19 +8,26 @@ document.addEventListener('DOMContentLoaded', () => {
 	splitButton.addEventListener('click', async () => {
 		const file = docxFile.files[0];
 		if (!file) {
-			setStatus('Please select a DOCX file first', 'error');
+			setStatus('Please select a file first', 'error');
 			return;
 		}
 
-		// Check if the file is a DOCX file
-		if (!file.name.toLowerCase().endsWith('.docx')) {
-			setStatus('Unsupported file format', 'error');
+		// Check if the file is a supported format
+		const fileExtension = file.name.toLowerCase().split('.').pop();
+		if (!['docx', 'rtf'].includes(fileExtension)) {
+			setStatus('Unsupported file format. Please use DOCX or RTF files.', 'error');
 			return;
 		}
 
 		try {
 			setStatus('Processing document...', 'info');
-			const content = await processDocx(file);
+			let content;
+			
+			if (fileExtension === 'docx') {
+				content = await processDocx(file);
+			} else {
+				content = await processRtf(file);
+			}
 			
 			// Check if there are headers of the selected level
 			if (!hasHeadersOfLevel(content, parseInt(headingLevel.value))) {
@@ -89,6 +96,94 @@ document.addEventListener('DOMContentLoaded', () => {
 		});
 	}
 
+	async function processRtf(file) {
+		const reader = new FileReader();
+		return new Promise((resolve, reject) => {
+			reader.onload = async (event) => {
+				try {
+					const rtfContent = event.target.result;
+					console.log('=== RTF Processing Debug ===');
+					
+					// Skip the RTF header and font table to get to the actual content
+					const contentStart = rtfContent.indexOf('\\viewkind');
+					if (contentStart === -1) {
+						throw new Error('Could not find document content in RTF file');
+					}
+					
+					const documentContent = rtfContent.substring(contentStart);
+					console.log('Document content (first 2000 chars):', documentContent.substring(0, 2000));
+					
+					// First, let's find all paragraphs
+					const paragraphs = documentContent.split('\\par');
+					console.log('Found paragraphs:', paragraphs.length);
+					
+					let processedContent = '';
+					let headerCount = 0;
+					
+					// Process each paragraph
+					paragraphs.forEach(paragraph => {
+						// Skip empty paragraphs
+						if (!paragraph.trim()) return;
+						
+						// Check if this paragraph has header-like formatting
+						const hasBold = paragraph.includes('\\b');
+						const hasFontSize = paragraph.match(/\\fs(\d+)/);
+						const fontSize = hasFontSize ? parseInt(hasFontSize[1]) : 0;
+						
+						// Extract the actual text
+						const text = paragraph
+							.replace(/\\[a-z0-9]+\s?/g, '')
+							.replace(/{|}/g, '')
+							.trim();
+						
+						if (!text) return;
+						
+						console.log('Processing paragraph:', {
+							text: text.substring(0, 50),
+							hasBold,
+							fontSize,
+							original: paragraph.substring(0, 100)
+						});
+						
+						// Determine if this is a header based on formatting
+						if (hasBold || fontSize >= 20) {
+							// Determine heading level based on font size
+							let level = 1;
+							if (fontSize >= 40) level = 1;
+							else if (fontSize >= 32) level = 2;
+							else if (fontSize >= 28) level = 3;
+							else if (fontSize >= 24) level = 4;
+							else if (fontSize >= 20) level = 5;
+							else level = 6;
+							
+							console.log('Found header:', {
+								text,
+								level,
+								fontSize
+							});
+							
+							processedContent += `<h${level}>${text}</h${level}>\n`;
+							headerCount++;
+						} else {
+							processedContent += `<p>${text}</p>\n`;
+						}
+					});
+					
+					console.log('Total headers found:', headerCount);
+					console.log('Final HTML content:', processedContent);
+					console.log('=== End RTF Processing Debug ===');
+					
+					resolve(processedContent);
+				} catch (error) {
+					console.error('Error processing RTF:', error);
+					reject(error);
+				}
+			};
+			reader.onerror = (error) => reject(error);
+			reader.readAsText(file);
+		});
+	}
+
 	// Global variable to store the codepage
 	let globalCodepage = "65001"; // Default to UTF-8
 
@@ -138,17 +233,31 @@ document.addEventListener('DOMContentLoaded', () => {
 	}
 
 	function hasHeadersOfLevel(html, targetLevel) {
+		console.log('=== Header Level Check Debug ===');
+		console.log('Target level:', targetLevel);
+		console.log('HTML content:', html);
+		
 		const parser = new DOMParser();
 		const doc = parser.parseFromString(html, 'text/html');
 		const elements = Array.from(doc.body.childNodes);
 		
-		return elements.some(element => {
+		console.log('Document elements:', elements.map(el => ({
+			tagName: el.tagName,
+			textContent: el.textContent?.substring(0, 50)
+		})));
+		
+		const hasHeaders = elements.some(element => {
 			if (element.nodeType === Node.ELEMENT_NODE) {
 				const headingLevel = getHeadingLevel(element);
+				console.log('Element:', element.tagName, 'Level:', headingLevel, 'Text:', element.textContent?.substring(0, 50));
 				return headingLevel && headingLevel <= targetLevel;
 			}
 			return false;
 		});
+		
+		console.log('Has headers:', hasHeaders);
+		console.log('=== End Header Level Check Debug ===');
+		return hasHeaders;
 	}
 
 	async function saveSectionsAsDocx(sections) {
diff --git a/index.html b/index.html
index 1014169..f844a56 100644
--- a/index.html
+++ b/index.html
@@ -11,7 +11,7 @@
 			<h1>DOCX Splitter</h1>
 			<h3>Input docx and split every header into a new rtf file</h3>
 			<div class="upload-section">
-				<input type="file" id="docxFile" accept=".docx" />
+				<input type="file" id="docxFile" accept=".docx,.rtf" />
 				<select id="headingLevel" class="heading-select" title="Split at this level and all levels above it">
 					<option value="1" selected>Split at H1 only</option>
 					<option value="2">Split at H1 and H2</option>
@@ -30,6 +30,7 @@ <h3>Input docx and split every header into a new rtf file</h3>
 		</div>
 		<script src="https://cdnjs.cloudflare.com/ajax/libs/mammoth/1.5.0/mammoth.browser.min.js"></script>
 		<script src="https://cdnjs.cloudflare.com/ajax/libs/FileSaver.js/2.0.5/FileSaver.min.js"></script>
+		<script src="https://cdn.jsdelivr.net/npm/rtf-parser@1.0.0/dist/rtf-parser.min.js"></script>
 		<script src="app.js"></script>
 	</body>
 </html>
diff --git a/package-lock.json b/package-lock.json
index 4120867..4e0799a 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,7 +9,8 @@
 			"version": "1.0.0",
 			"dependencies": {
 				"file-saver": "^2.0.5",
-				"mammoth": "^1.5.0"
+				"mammoth": "^1.5.0",
+				"rtf-parser": "^1.0.0"
 			},
 			"devDependencies": {
 				"http-server": "^14.1.1"
@@ -687,6 +688,28 @@
 			"dev": true,
 			"license": "MIT"
 		},
+		"node_modules/rtf-parser": {
+			"version": "1.3.3",
+			"resolved": "https://registry.npmjs.org/rtf-parser/-/rtf-parser-1.3.3.tgz",
+			"integrity": "sha512-sz2eb4tcCFtwVfs5Ei/l3JnSQGqpDv+drFuNz/zwn2tA24cL2WTuk2VMo2bA4IcRgkn38juAOri2hB9nv85u2Q==",
+			"license": "ISC",
+			"dependencies": {
+				"iconv-lite": "^0.4.15",
+				"readable-stream": "^2.2.2"
+			}
+		},
+		"node_modules/rtf-parser/node_modules/iconv-lite": {
+			"version": "0.4.24",
+			"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
+			"integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==",
+			"license": "MIT",
+			"dependencies": {
+				"safer-buffer": ">= 2.1.2 < 3"
+			},
+			"engines": {
+				"node": ">=0.10.0"
+			}
+		},
 		"node_modules/safe-buffer": {
 			"version": "5.1.2",
 			"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
@@ -697,7 +720,6 @@
 			"version": "2.1.2",
 			"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
 			"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
-			"dev": true,
 			"license": "MIT"
 		},
 		"node_modules/secure-compare": {
diff --git a/package.json b/package.json
index 6a4aa93..bc3ec9b 100644
--- a/package.json
+++ b/package.json
@@ -8,7 +8,8 @@
 	},
 	"dependencies": {
 		"mammoth": "^1.5.0",
-		"file-saver": "^2.0.5"
+		"file-saver": "^2.0.5",
+		"rtf-parser": "^1.0.0"
 	},
 	"devDependencies": {
 		"http-server": "^14.1.1"

From 07688d91183c9923c5e6c4b092a95fc3f6ec504a Mon Sep 17 00:00:00 2001
From: Bartek Biedrzycki <b.biedrzycki@cksource.com>
Date: Thu, 12 Jun 2025 13:06:30 +0200
Subject: [PATCH 2/6] RTF encoding.

---
 app.js | 46 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/app.js b/app.js
index a3bf81f..5d3beb8 100644
--- a/app.js
+++ b/app.js
@@ -101,7 +101,29 @@ document.addEventListener('DOMContentLoaded', () => {
 		return new Promise((resolve, reject) => {
 			reader.onload = async (event) => {
 				try {
-					const rtfContent = event.target.result;
+					// Read as ArrayBuffer to preserve encoding
+					const arrayBuffer = event.target.result;
+					
+					// First read the header to detect codepage
+					const headerView = new Uint8Array(arrayBuffer.slice(0, 1000));
+					const headerText = new TextDecoder('latin1').decode(headerView);
+					
+					// Look for codepage declaration in RTF header
+					const codepageMatch = headerText.match(/\\ansicpg(\d+)/);
+					if (!codepageMatch) {
+						throw new Error('Could not detect codepage in RTF file');
+					}
+					
+					const codepage = codepageMatch[1];
+					console.log('Detected codepage from RTF:', codepage);
+					
+					// Set the global codepage for consistent encoding handling
+					globalCodepage = codepage;
+					
+					// Read the entire file using the detected codepage
+					const decoder = new TextDecoder(`windows-${codepage}`);
+					const rtfContent = decoder.decode(arrayBuffer);
+					
 					console.log('=== RTF Processing Debug ===');
 					
 					// Skip the RTF header and font table to get to the actual content
@@ -130,12 +152,20 @@ document.addEventListener('DOMContentLoaded', () => {
 						const hasFontSize = paragraph.match(/\\fs(\d+)/);
 						const fontSize = hasFontSize ? parseInt(hasFontSize[1]) : 0;
 						
-						// Extract the actual text
-						const text = paragraph
+						// Extract the actual text, properly handling Unicode
+						let text = paragraph
 							.replace(/\\[a-z0-9]+\s?/g, '')
 							.replace(/{|}/g, '')
 							.trim();
 						
+						// Convert hex escape sequences to their proper characters
+						text = text.replace(/\\'([0-9a-f]{2})/gi, (match, hex) => {
+							const code = parseInt(hex, 16);
+							// Convert from Windows codepage to Unicode
+							const buffer = new Uint8Array([code]);
+							return new TextDecoder(`windows-${codepage}`).decode(buffer);
+						});
+						
 						if (!text) return;
 						
 						console.log('Processing paragraph:', {
@@ -180,7 +210,7 @@ document.addEventListener('DOMContentLoaded', () => {
 				}
 			};
 			reader.onerror = (error) => reject(error);
-			reader.readAsText(file);
+			reader.readAsArrayBuffer(file);
 		});
 	}
 
@@ -205,8 +235,14 @@ document.addEventListener('DOMContentLoaded', () => {
 					if (currentSection) {
 						sections.push(currentSection);
 					}
+					
+					// Get the text content and remove any RTF artifacts
+					const title = element.textContent
+						.replace(/^[^a-zA-Z]*/, '')  // Remove everything before first letter
+						.trim();
+					
 					currentSection = {
-						title: element.textContent,
+						title: title,
 						content: ''
 					};
 				} else if (currentSection) {

From a053a3b5e87961d5a2674b6e1fab4dff6038a456 Mon Sep 17 00:00:00 2001
From: Bartek Biedrzycki <b.biedrzycki@cksource.com>
Date: Thu, 12 Jun 2025 13:33:52 +0200
Subject: [PATCH 3/6] Docs.

---
 CHANGELOG.md   | 5 -----
 user_manual.md | 5 +++++
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index de83ef3..6600a2e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,11 +5,6 @@ All notable changes to this project will be documented in this file.
 ## [0.6] - 2025-06-12
 ### Added
 - RTF input file support
-- RTF to HTML conversion for processing
-- Support for RTF formatting (bold, italic, underline)
-
-## [0.5] - 2025-06-11
-### Added
 - Error messages (for wrong file format and missing headers levels)
 
 ## [0.4] - 2025-05-29
diff --git a/user_manual.md b/user_manual.md
index 7eb36cd..a89a2db 100644
--- a/user_manual.md
+++ b/user_manual.md
@@ -46,3 +46,8 @@ See [technical details](technical_details.md) for, uhm, details?
 * Why did you vibe-code it, AI is killing our market!
 
   It's killing mine, too, and the markets of plenty of my friends, but I just cannot devote several weeks to learn to code a tool I will use twice. Tried before and failed.
+
+## Error codes
+
+* `No headings of the selected size present in the document.` - the uploaded document does not have headings of the specified level. For example, you chose H1 headings as a split point, but the document only has H2 or smaller headings.
+* `Unsupported file format.` - the file format you tried to upload is not supported.
\ No newline at end of file

From 0478c0683c84a4fdc0704c0b7e928553a6ff2ac3 Mon Sep 17 00:00:00 2001
From: Bartek Biedrzycki <b.biedrzycki@cksource.com>
Date: Fri, 13 Jun 2025 07:12:48 +0200
Subject: [PATCH 4/6] Remove arsid.

---
 app.js | 1 +
 1 file changed, 1 insertion(+)

diff --git a/app.js b/app.js
index 5d3beb8..a538ebc 100644
--- a/app.js
+++ b/app.js
@@ -156,6 +156,7 @@ document.addEventListener('DOMContentLoaded', () => {
 						let text = paragraph
 							.replace(/\\[a-z0-9]+\s?/g, '')
 							.replace(/{|}/g, '')
+							.replace(/^arsid\s*/i, '')  // Remove arsid heading if present
 							.trim();
 						
 						// Convert hex escape sequences to their proper characters

From 30883cabf43d1ae0adaf413189085de724b2745f Mon Sep 17 00:00:00 2001
From: Bartek Biedrzycki <b.biedrzycki@cksource.com>
Date: Fri, 13 Jun 2025 07:57:10 +0200
Subject: [PATCH 5/6] Leaving the code for later.

---
 app.js | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/app.js b/app.js
index a538ebc..33e00a5 100644
--- a/app.js
+++ b/app.js
@@ -152,11 +152,17 @@ document.addEventListener('DOMContentLoaded', () => {
 						const hasFontSize = paragraph.match(/\\fs(\d+)/);
 						const fontSize = hasFontSize ? parseInt(hasFontSize[1]) : 0;
 						
+						// Check for bookmark sequences
+						const hasBookmarkStart = paragraph.includes('{\\*\\bkmkstart');
+						const hasBookmarkEnd = paragraph.includes('{\\*\\bkmkend');
+						
 						// Extract the actual text, properly handling Unicode
 						let text = paragraph
 							.replace(/\\[a-z0-9]+\s?/g, '')
 							.replace(/{|}/g, '')
 							.replace(/^arsid\s*/i, '')  // Remove arsid heading if present
+							.replace(/{\\*\\bkmkstart[^}]*}/g, '')  // Remove bookmark start sequences
+							.replace(/{\\*\\bkmkend[^}]*}/g, '')    // Remove bookmark end sequences
 							.trim();
 						
 						// Convert hex escape sequences to their proper characters
@@ -173,11 +179,13 @@ document.addEventListener('DOMContentLoaded', () => {
 							text: text.substring(0, 50),
 							hasBold,
 							fontSize,
+							hasBookmarkStart,
+							hasBookmarkEnd,
 							original: paragraph.substring(0, 100)
 						});
 						
-						// Determine if this is a header based on formatting
-						if (hasBold || fontSize >= 20) {
+						// Determine if this is a header based on formatting or bookmarks
+						if (hasBold || fontSize >= 20 || (hasBookmarkStart && hasBookmarkEnd)) {
 							// Determine heading level based on font size
 							let level = 1;
 							if (fontSize >= 40) level = 1;
@@ -190,7 +198,8 @@ document.addEventListener('DOMContentLoaded', () => {
 							console.log('Found header:', {
 								text,
 								level,
-								fontSize
+								fontSize,
+								hasBookmarks: hasBookmarkStart && hasBookmarkEnd
 							});
 							
 							processedContent += `<h${level}>${text}</h${level}>\n`;

From f93531044d984bb00892f1f8cb5f133833ece870 Mon Sep 17 00:00:00 2001
From: Bartek Biedrzycki <68123541+godai78@users.noreply.github.com>
Date: Fri, 13 Jun 2025 11:35:13 +0200
Subject: [PATCH 6/6] Update app.js

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 app.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app.js b/app.js
index 33e00a5..4a34417 100644
--- a/app.js
+++ b/app.js
@@ -124,7 +124,7 @@ document.addEventListener('DOMContentLoaded', () => {
 					const decoder = new TextDecoder(`windows-${codepage}`);
 					const rtfContent = decoder.decode(arrayBuffer);
 					
-					console.log('=== RTF Processing Debug ===');
+					if (debug) console.log('=== RTF Processing Debug ===');
 					
 					// Skip the RTF header and font table to get to the actual content
 					const contentStart = rtfContent.indexOf('\\viewkind');