From 68cac95fa495c17596fdb9b77bb4feee7d7ff1eb Mon Sep 17 00:00:00 2001 From: Bartek Biedrzycki Date: Thu, 12 Jun 2025 12:27:13 +0200 Subject: [PATCH 1/6] Adding RTF input. --- CHANGELOG.md | 6 +++ README.md | 6 +-- app.js | 121 +++++++++++++++++++++++++++++++++++++++++++--- index.html | 3 +- package-lock.json | 26 +++++++++- package.json | 3 +- 6 files changed, 152 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a339e94..de83ef3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ All notable changes to this project will be documented in this file. +## [0.6] - 2025-06-12 +### Added +- RTF input file support +- RTF to HTML conversion for processing +- Support for RTF formatting (bold, italic, underline) + ## [0.5] - 2025-06-11 ### Added - Error messages (for wrong file format and missing headers levels) diff --git a/README.md b/README.md index 403fbc2..263b7df 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ -# DOCX Splitter v 0.5 +# DOCX Splitter v 0.6 -A web application that splits Microsoft Word `.docx` files by selected heading level and saves them as separate `.rtf` files. +A web application that splits Microsoft Word `.docx` files and RTF files by selected heading level and saves them as separate `.rtf` files. ![Pepe Agent](pepeagent.gif) ## Features -- Split `.docx` files by headings (H1-H6) +- Split `.docx` and `.rtf` files by headings (H1-H6) - Select which heading level to split at (splits at selected level and all levels above it) - Clean and modern web interface - Automatic file naming based on headings diff --git a/app.js b/app.js index 1394d36..a3bf81f 100644 --- a/app.js +++ b/app.js @@ -8,19 +8,26 @@ document.addEventListener('DOMContentLoaded', () => { splitButton.addEventListener('click', async () => { const file = docxFile.files[0]; if (!file) { - setStatus('Please select a DOCX file first', 'error'); + setStatus('Please select a file first', 'error'); return; } - // Check if the file is a DOCX file - if (!file.name.toLowerCase().endsWith('.docx')) { - setStatus('Unsupported file format', 'error'); + // Check if the file is a supported format + const fileExtension = file.name.toLowerCase().split('.').pop(); + if (!['docx', 'rtf'].includes(fileExtension)) { + setStatus('Unsupported file format. Please use DOCX or RTF files.', 'error'); return; } try { setStatus('Processing document...', 'info'); - const content = await processDocx(file); + let content; + + if (fileExtension === 'docx') { + content = await processDocx(file); + } else { + content = await processRtf(file); + } // Check if there are headers of the selected level if (!hasHeadersOfLevel(content, parseInt(headingLevel.value))) { @@ -89,6 +96,94 @@ document.addEventListener('DOMContentLoaded', () => { }); } + async function processRtf(file) { + const reader = new FileReader(); + return new Promise((resolve, reject) => { + reader.onload = async (event) => { + try { + const rtfContent = event.target.result; + console.log('=== RTF Processing Debug ==='); + + // Skip the RTF header and font table to get to the actual content + const contentStart = rtfContent.indexOf('\\viewkind'); + if (contentStart === -1) { + throw new Error('Could not find document content in RTF file'); + } + + const documentContent = rtfContent.substring(contentStart); + console.log('Document content (first 2000 chars):', documentContent.substring(0, 2000)); + + // First, let's find all paragraphs + const paragraphs = documentContent.split('\\par'); + console.log('Found paragraphs:', paragraphs.length); + + let processedContent = ''; + let headerCount = 0; + + // Process each paragraph + paragraphs.forEach(paragraph => { + // Skip empty paragraphs + if (!paragraph.trim()) return; + + // Check if this paragraph has header-like formatting + const hasBold = paragraph.includes('\\b'); + const hasFontSize = paragraph.match(/\\fs(\d+)/); + const fontSize = hasFontSize ? parseInt(hasFontSize[1]) : 0; + + // Extract the actual text + const text = paragraph + .replace(/\\[a-z0-9]+\s?/g, '') + .replace(/{|}/g, '') + .trim(); + + if (!text) return; + + console.log('Processing paragraph:', { + text: text.substring(0, 50), + hasBold, + fontSize, + original: paragraph.substring(0, 100) + }); + + // Determine if this is a header based on formatting + if (hasBold || fontSize >= 20) { + // Determine heading level based on font size + let level = 1; + if (fontSize >= 40) level = 1; + else if (fontSize >= 32) level = 2; + else if (fontSize >= 28) level = 3; + else if (fontSize >= 24) level = 4; + else if (fontSize >= 20) level = 5; + else level = 6; + + console.log('Found header:', { + text, + level, + fontSize + }); + + processedContent += `${text}\n`; + headerCount++; + } else { + processedContent += `

${text}

\n`; + } + }); + + console.log('Total headers found:', headerCount); + console.log('Final HTML content:', processedContent); + console.log('=== End RTF Processing Debug ==='); + + resolve(processedContent); + } catch (error) { + console.error('Error processing RTF:', error); + reject(error); + } + }; + reader.onerror = (error) => reject(error); + reader.readAsText(file); + }); + } + // Global variable to store the codepage let globalCodepage = "65001"; // Default to UTF-8 @@ -138,17 +233,31 @@ document.addEventListener('DOMContentLoaded', () => { } function hasHeadersOfLevel(html, targetLevel) { + console.log('=== Header Level Check Debug ==='); + console.log('Target level:', targetLevel); + console.log('HTML content:', html); + const parser = new DOMParser(); const doc = parser.parseFromString(html, 'text/html'); const elements = Array.from(doc.body.childNodes); - return elements.some(element => { + console.log('Document elements:', elements.map(el => ({ + tagName: el.tagName, + textContent: el.textContent?.substring(0, 50) + }))); + + const hasHeaders = elements.some(element => { if (element.nodeType === Node.ELEMENT_NODE) { const headingLevel = getHeadingLevel(element); + console.log('Element:', element.tagName, 'Level:', headingLevel, 'Text:', element.textContent?.substring(0, 50)); return headingLevel && headingLevel <= targetLevel; } return false; }); + + console.log('Has headers:', hasHeaders); + console.log('=== End Header Level Check Debug ==='); + return hasHeaders; } async function saveSectionsAsDocx(sections) { diff --git a/index.html b/index.html index 1014169..f844a56 100644 --- a/index.html +++ b/index.html @@ -11,7 +11,7 @@

DOCX Splitter

Input docx and split every header into a new rtf file

- +