diff --git a/CHANGELOG.md b/CHANGELOG.md index a339e94..6600a2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,9 @@ All notable changes to this project will be documented in this file. -## [0.5] - 2025-06-11 +## [0.6] - 2025-06-12 ### Added +- RTF input file support - Error messages (for wrong file format and missing headers levels) ## [0.4] - 2025-05-29 diff --git a/README.md b/README.md index 403fbc2..263b7df 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ -# DOCX Splitter v 0.5 +# DOCX Splitter v 0.6 -A web application that splits Microsoft Word `.docx` files by selected heading level and saves them as separate `.rtf` files. +A web application that splits Microsoft Word `.docx` files and RTF files by selected heading level and saves them as separate `.rtf` files. ![Pepe Agent](pepeagent.gif) ## Features -- Split `.docx` files by headings (H1-H6) +- Split `.docx` and `.rtf` files by headings (H1-H6) - Select which heading level to split at (splits at selected level and all levels above it) - Clean and modern web interface - Automatic file naming based on headings diff --git a/app.js b/app.js index 1394d36..4a34417 100644 --- a/app.js +++ b/app.js @@ -8,19 +8,26 @@ document.addEventListener('DOMContentLoaded', () => { splitButton.addEventListener('click', async () => { const file = docxFile.files[0]; if (!file) { - setStatus('Please select a DOCX file first', 'error'); + setStatus('Please select a file first', 'error'); return; } - // Check if the file is a DOCX file - if (!file.name.toLowerCase().endsWith('.docx')) { - setStatus('Unsupported file format', 'error'); + // Check if the file is a supported format + const fileExtension = file.name.toLowerCase().split('.').pop(); + if (!['docx', 'rtf'].includes(fileExtension)) { + setStatus('Unsupported file format. Please use DOCX or RTF files.', 'error'); return; } try { setStatus('Processing document...', 'info'); - const content = await processDocx(file); + let content; + + if (fileExtension === 'docx') { + content = await processDocx(file); + } else { + content = await processRtf(file); + } // Check if there are headers of the selected level if (!hasHeadersOfLevel(content, parseInt(headingLevel.value))) { @@ -89,6 +96,134 @@ document.addEventListener('DOMContentLoaded', () => { }); } + async function processRtf(file) { + const reader = new FileReader(); + return new Promise((resolve, reject) => { + reader.onload = async (event) => { + try { + // Read as ArrayBuffer to preserve encoding + const arrayBuffer = event.target.result; + + // First read the header to detect codepage + const headerView = new Uint8Array(arrayBuffer.slice(0, 1000)); + const headerText = new TextDecoder('latin1').decode(headerView); + + // Look for codepage declaration in RTF header + const codepageMatch = headerText.match(/\\ansicpg(\d+)/); + if (!codepageMatch) { + throw new Error('Could not detect codepage in RTF file'); + } + + const codepage = codepageMatch[1]; + console.log('Detected codepage from RTF:', codepage); + + // Set the global codepage for consistent encoding handling + globalCodepage = codepage; + + // Read the entire file using the detected codepage + const decoder = new TextDecoder(`windows-${codepage}`); + const rtfContent = decoder.decode(arrayBuffer); + + if (debug) console.log('=== RTF Processing Debug ==='); + + // Skip the RTF header and font table to get to the actual content + const contentStart = rtfContent.indexOf('\\viewkind'); + if (contentStart === -1) { + throw new Error('Could not find document content in RTF file'); + } + + const documentContent = rtfContent.substring(contentStart); + console.log('Document content (first 2000 chars):', documentContent.substring(0, 2000)); + + // First, let's find all paragraphs + const paragraphs = documentContent.split('\\par'); + console.log('Found paragraphs:', paragraphs.length); + + let processedContent = ''; + let headerCount = 0; + + // Process each paragraph + paragraphs.forEach(paragraph => { + // Skip empty paragraphs + if (!paragraph.trim()) return; + + // Check if this paragraph has header-like formatting + const hasBold = paragraph.includes('\\b'); + const hasFontSize = paragraph.match(/\\fs(\d+)/); + const fontSize = hasFontSize ? parseInt(hasFontSize[1]) : 0; + + // Check for bookmark sequences + const hasBookmarkStart = paragraph.includes('{\\*\\bkmkstart'); + const hasBookmarkEnd = paragraph.includes('{\\*\\bkmkend'); + + // Extract the actual text, properly handling Unicode + let text = paragraph + .replace(/\\[a-z0-9]+\s?/g, '') + .replace(/{|}/g, '') + .replace(/^arsid\s*/i, '') // Remove arsid heading if present + .replace(/{\\*\\bkmkstart[^}]*}/g, '') // Remove bookmark start sequences + .replace(/{\\*\\bkmkend[^}]*}/g, '') // Remove bookmark end sequences + .trim(); + + // Convert hex escape sequences to their proper characters + text = text.replace(/\\'([0-9a-f]{2})/gi, (match, hex) => { + const code = parseInt(hex, 16); + // Convert from Windows codepage to Unicode + const buffer = new Uint8Array([code]); + return new TextDecoder(`windows-${codepage}`).decode(buffer); + }); + + if (!text) return; + + console.log('Processing paragraph:', { + text: text.substring(0, 50), + hasBold, + fontSize, + hasBookmarkStart, + hasBookmarkEnd, + original: paragraph.substring(0, 100) + }); + + // Determine if this is a header based on formatting or bookmarks + if (hasBold || fontSize >= 20 || (hasBookmarkStart && hasBookmarkEnd)) { + // Determine heading level based on font size + let level = 1; + if (fontSize >= 40) level = 1; + else if (fontSize >= 32) level = 2; + else if (fontSize >= 28) level = 3; + else if (fontSize >= 24) level = 4; + else if (fontSize >= 20) level = 5; + else level = 6; + + console.log('Found header:', { + text, + level, + fontSize, + hasBookmarks: hasBookmarkStart && hasBookmarkEnd + }); + + processedContent += `${text}\n`; + headerCount++; + } else { + processedContent += `

${text}

\n`; + } + }); + + console.log('Total headers found:', headerCount); + console.log('Final HTML content:', processedContent); + console.log('=== End RTF Processing Debug ==='); + + resolve(processedContent); + } catch (error) { + console.error('Error processing RTF:', error); + reject(error); + } + }; + reader.onerror = (error) => reject(error); + reader.readAsArrayBuffer(file); + }); + } + // Global variable to store the codepage let globalCodepage = "65001"; // Default to UTF-8 @@ -110,8 +245,14 @@ document.addEventListener('DOMContentLoaded', () => { if (currentSection) { sections.push(currentSection); } + + // Get the text content and remove any RTF artifacts + const title = element.textContent + .replace(/^[^a-zA-Z]*/, '') // Remove everything before first letter + .trim(); + currentSection = { - title: element.textContent, + title: title, content: '' }; } else if (currentSection) { @@ -138,17 +279,31 @@ document.addEventListener('DOMContentLoaded', () => { } function hasHeadersOfLevel(html, targetLevel) { + console.log('=== Header Level Check Debug ==='); + console.log('Target level:', targetLevel); + console.log('HTML content:', html); + const parser = new DOMParser(); const doc = parser.parseFromString(html, 'text/html'); const elements = Array.from(doc.body.childNodes); - return elements.some(element => { + console.log('Document elements:', elements.map(el => ({ + tagName: el.tagName, + textContent: el.textContent?.substring(0, 50) + }))); + + const hasHeaders = elements.some(element => { if (element.nodeType === Node.ELEMENT_NODE) { const headingLevel = getHeadingLevel(element); + console.log('Element:', element.tagName, 'Level:', headingLevel, 'Text:', element.textContent?.substring(0, 50)); return headingLevel && headingLevel <= targetLevel; } return false; }); + + console.log('Has headers:', hasHeaders); + console.log('=== End Header Level Check Debug ==='); + return hasHeaders; } async function saveSectionsAsDocx(sections) { diff --git a/index.html b/index.html index 1014169..f844a56 100644 --- a/index.html +++ b/index.html @@ -11,7 +11,7 @@

DOCX Splitter

Input docx and split every header into a new rtf file

- +