diff --git a/CHANGELOG.md b/CHANGELOG.md
index a339e94..6600a2e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,8 +2,9 @@
All notable changes to this project will be documented in this file.
-## [0.5] - 2025-06-11
+## [0.6] - 2025-06-12
### Added
+- RTF input file support
- Error messages (for wrong file format and missing headers levels)
## [0.4] - 2025-05-29
diff --git a/README.md b/README.md
index 403fbc2..263b7df 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
-# DOCX Splitter v 0.5
+# DOCX Splitter v 0.6
-A web application that splits Microsoft Word `.docx` files by selected heading level and saves them as separate `.rtf` files.
+A web application that splits Microsoft Word `.docx` files and RTF files by selected heading level and saves them as separate `.rtf` files.

## Features
-- Split `.docx` files by headings (H1-H6)
+- Split `.docx` and `.rtf` files by headings (H1-H6)
- Select which heading level to split at (splits at selected level and all levels above it)
- Clean and modern web interface
- Automatic file naming based on headings
diff --git a/app.js b/app.js
index 1394d36..4a34417 100644
--- a/app.js
+++ b/app.js
@@ -8,19 +8,26 @@ document.addEventListener('DOMContentLoaded', () => {
splitButton.addEventListener('click', async () => {
const file = docxFile.files[0];
if (!file) {
- setStatus('Please select a DOCX file first', 'error');
+ setStatus('Please select a file first', 'error');
return;
}
- // Check if the file is a DOCX file
- if (!file.name.toLowerCase().endsWith('.docx')) {
- setStatus('Unsupported file format', 'error');
+ // Check if the file is a supported format
+ const fileExtension = file.name.toLowerCase().split('.').pop();
+ if (!['docx', 'rtf'].includes(fileExtension)) {
+ setStatus('Unsupported file format. Please use DOCX or RTF files.', 'error');
return;
}
try {
setStatus('Processing document...', 'info');
- const content = await processDocx(file);
+ let content;
+
+ if (fileExtension === 'docx') {
+ content = await processDocx(file);
+ } else {
+ content = await processRtf(file);
+ }
// Check if there are headers of the selected level
if (!hasHeadersOfLevel(content, parseInt(headingLevel.value))) {
@@ -89,6 +96,134 @@ document.addEventListener('DOMContentLoaded', () => {
});
}
+ async function processRtf(file) {
+ const reader = new FileReader();
+ return new Promise((resolve, reject) => {
+ reader.onload = async (event) => {
+ try {
+ // Read as ArrayBuffer to preserve encoding
+ const arrayBuffer = event.target.result;
+
+ // First read the header to detect codepage
+ const headerView = new Uint8Array(arrayBuffer.slice(0, 1000));
+ const headerText = new TextDecoder('latin1').decode(headerView);
+
+ // Look for codepage declaration in RTF header
+ const codepageMatch = headerText.match(/\\ansicpg(\d+)/);
+ if (!codepageMatch) {
+ throw new Error('Could not detect codepage in RTF file');
+ }
+
+ const codepage = codepageMatch[1];
+ console.log('Detected codepage from RTF:', codepage);
+
+ // Set the global codepage for consistent encoding handling
+ globalCodepage = codepage;
+
+ // Read the entire file using the detected codepage
+ const decoder = new TextDecoder(`windows-${codepage}`);
+ const rtfContent = decoder.decode(arrayBuffer);
+
+ if (debug) console.log('=== RTF Processing Debug ===');
+
+ // Skip the RTF header and font table to get to the actual content
+ const contentStart = rtfContent.indexOf('\\viewkind');
+ if (contentStart === -1) {
+ throw new Error('Could not find document content in RTF file');
+ }
+
+ const documentContent = rtfContent.substring(contentStart);
+ console.log('Document content (first 2000 chars):', documentContent.substring(0, 2000));
+
+ // First, let's find all paragraphs
+ const paragraphs = documentContent.split('\\par');
+ console.log('Found paragraphs:', paragraphs.length);
+
+ let processedContent = '';
+ let headerCount = 0;
+
+ // Process each paragraph
+ paragraphs.forEach(paragraph => {
+ // Skip empty paragraphs
+ if (!paragraph.trim()) return;
+
+ // Check if this paragraph has header-like formatting
+ const hasBold = paragraph.includes('\\b');
+ const hasFontSize = paragraph.match(/\\fs(\d+)/);
+ const fontSize = hasFontSize ? parseInt(hasFontSize[1]) : 0;
+
+ // Check for bookmark sequences
+ const hasBookmarkStart = paragraph.includes('{\\*\\bkmkstart');
+ const hasBookmarkEnd = paragraph.includes('{\\*\\bkmkend');
+
+ // Extract the actual text, properly handling Unicode
+ let text = paragraph
+ .replace(/\\[a-z0-9]+\s?/g, '')
+ .replace(/{|}/g, '')
+ .replace(/^arsid\s*/i, '') // Remove arsid heading if present
+ .replace(/{\\*\\bkmkstart[^}]*}/g, '') // Remove bookmark start sequences
+ .replace(/{\\*\\bkmkend[^}]*}/g, '') // Remove bookmark end sequences
+ .trim();
+
+ // Convert hex escape sequences to their proper characters
+ text = text.replace(/\\'([0-9a-f]{2})/gi, (match, hex) => {
+ const code = parseInt(hex, 16);
+ // Convert from Windows codepage to Unicode
+ const buffer = new Uint8Array([code]);
+ return new TextDecoder(`windows-${codepage}`).decode(buffer);
+ });
+
+ if (!text) return;
+
+ console.log('Processing paragraph:', {
+ text: text.substring(0, 50),
+ hasBold,
+ fontSize,
+ hasBookmarkStart,
+ hasBookmarkEnd,
+ original: paragraph.substring(0, 100)
+ });
+
+ // Determine if this is a header based on formatting or bookmarks
+ if (hasBold || fontSize >= 20 || (hasBookmarkStart && hasBookmarkEnd)) {
+ // Determine heading level based on font size
+ let level = 1;
+ if (fontSize >= 40) level = 1;
+ else if (fontSize >= 32) level = 2;
+ else if (fontSize >= 28) level = 3;
+ else if (fontSize >= 24) level = 4;
+ else if (fontSize >= 20) level = 5;
+ else level = 6;
+
+ console.log('Found header:', {
+ text,
+ level,
+ fontSize,
+ hasBookmarks: hasBookmarkStart && hasBookmarkEnd
+ });
+
+ processedContent += `
${text}
\n`; + } + }); + + console.log('Total headers found:', headerCount); + console.log('Final HTML content:', processedContent); + console.log('=== End RTF Processing Debug ==='); + + resolve(processedContent); + } catch (error) { + console.error('Error processing RTF:', error); + reject(error); + } + }; + reader.onerror = (error) => reject(error); + reader.readAsArrayBuffer(file); + }); + } + // Global variable to store the codepage let globalCodepage = "65001"; // Default to UTF-8 @@ -110,8 +245,14 @@ document.addEventListener('DOMContentLoaded', () => { if (currentSection) { sections.push(currentSection); } + + // Get the text content and remove any RTF artifacts + const title = element.textContent + .replace(/^[^a-zA-Z]*/, '') // Remove everything before first letter + .trim(); + currentSection = { - title: element.textContent, + title: title, content: '' }; } else if (currentSection) { @@ -138,17 +279,31 @@ document.addEventListener('DOMContentLoaded', () => { } function hasHeadersOfLevel(html, targetLevel) { + console.log('=== Header Level Check Debug ==='); + console.log('Target level:', targetLevel); + console.log('HTML content:', html); + const parser = new DOMParser(); const doc = parser.parseFromString(html, 'text/html'); const elements = Array.from(doc.body.childNodes); - return elements.some(element => { + console.log('Document elements:', elements.map(el => ({ + tagName: el.tagName, + textContent: el.textContent?.substring(0, 50) + }))); + + const hasHeaders = elements.some(element => { if (element.nodeType === Node.ELEMENT_NODE) { const headingLevel = getHeadingLevel(element); + console.log('Element:', element.tagName, 'Level:', headingLevel, 'Text:', element.textContent?.substring(0, 50)); return headingLevel && headingLevel <= targetLevel; } return false; }); + + console.log('Has headers:', hasHeaders); + console.log('=== End Header Level Check Debug ==='); + return hasHeaders; } async function saveSectionsAsDocx(sections) { diff --git a/index.html b/index.html index 1014169..f844a56 100644 --- a/index.html +++ b/index.html @@ -11,7 +11,7 @@