From 004cac3bf5fc9da55422a96d20416b46810ee390 Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Tue, 19 May 2026 21:36:57 +0200 Subject: [PATCH 1/6] pre-process string input --- .../jfiveparse/ProcessedInputStream.java | 73 ++++++++++--------- .../jfiveparse/ResizableCharBuilder.java | 8 ++ 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java index 75826c1..a341982 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java @@ -17,20 +17,13 @@ import java.io.IOException; import java.io.Reader; +import java.util.Arrays; /** - * Even though the html5 specification is working with codepoints, this input - * stream will only emit chars and "-1". - * - * This has some interesting consequences that we will need to fully explore: - * + * Wrapped and abstracted input. Can most likely be optimized. */ abstract class ProcessedInputStream { - private boolean crFound; protected final ResizableIntBuffer buffer = new ResizableIntBuffer(); protected abstract int read(); @@ -40,22 +33,40 @@ static class StringProcessedInputStream extends ProcessedInputStream { protected final char[] input; StringProcessedInputStream(String input) { - this.input = input.toCharArray(); + this.input = normalize(input); + } + + private static char[] normalize(String s) { + char[] arr = s.toCharArray(); + int n = arr.length; + int j = 0; + for (int i = 0; i < n; i++) { + char c = arr[i]; + if (c == '\r') { + arr[j++] = '\n'; + if (i + 1 < n && arr[i + 1] == '\n') { + i++; + } + } else { + arr[j++] = c; + } + } + return j == n ? arr : Arrays.copyOf(arr, j); } @Override protected int read() { - try { + if (pos < input.length) { return input[pos++]; - } catch (IndexOutOfBoundsException s) { - return -1; } + return -1; } } static final class ReaderProcessedInputStream extends ProcessedInputStream { private final Reader reader; + private boolean crFound; ReaderProcessedInputStream(Reader reader) { this.reader = reader; @@ -64,7 +75,19 @@ static final class ReaderProcessedInputStream extends ProcessedInputStream { @Override protected int read() { try { - return reader.read(); + int chr = reader.read(); + if (crFound) { + crFound = false; + if (chr == Characters.LF) { + chr = reader.read(); + } + } + + if (chr == Characters.CR) { + crFound = true; + chr = Characters.LF; + } + return chr; } catch (IOException ioe) { throw new ParserException(ioe); } @@ -72,29 +95,11 @@ protected int read() { } // - private int readWithCRHandling() { - int chr = read(); - if (crFound) { - //chr = handleCrFoundInternal(chr); - crFound = false; - if (chr == Characters.LF) { - chr = read(); - } - } - - if (chr == Characters.CR) { - // handleChrIsCR - crFound = true; - chr = Characters.LF; - } - return chr; - } - int peekNextInputCharacter(int offset) { if (buffer.length() < offset) { // fill buffer for (int i = buffer.length(); i < offset; i++) { - buffer.add(readWithCRHandling()); + buffer.add(read()); } } return buffer.getCharAt(offset); @@ -111,7 +116,7 @@ int getNextInputCharacterAndConsume() { } int consume() { - return buffer.isEmpty ? readWithCRHandling() : buffer.removeFirst(); + return buffer.isEmpty ? read() : buffer.removeFirst(); } void reconsume(int chr) { diff --git a/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java b/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java index 90c24d0..9ac93a3 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java @@ -103,5 +103,13 @@ char[] copyBackingCharArray() { return Arrays.copyOf(buff, pos); } + void append(char[] c, int offset, int length) { + if (pos + length >= buff.length) { + buff = Arrays.copyOf(buff, Math.max(pos + length, buff.length * 2 + 2)); + } + System.arraycopy(c, offset, buff, pos, length); + pos += length; + } + } From 2d05b7242464e944203c221dd386a184589e3d2b Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Tue, 19 May 2026 22:47:12 +0200 Subject: [PATCH 2/6] initial work for processing directly the character streams inside the input stream resource --- .../digitalfondue/jfiveparse/Characters.java | 3 + .../jfiveparse/ProcessedInputStream.java | 232 ++++++++++++++---- .../ProcessedInputStreamWithParseError.java | 8 - 3 files changed, 186 insertions(+), 57 deletions(-) diff --git a/src/main/java/ch/digitalfondue/jfiveparse/Characters.java b/src/main/java/ch/digitalfondue/jfiveparse/Characters.java index 512be84..f9eaf16 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/Characters.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/Characters.java @@ -22,6 +22,7 @@ final class Characters { static final char NULL = 0x0000; static final char EXCLAMATION_MARK = 0x0021; + /** & */ static final char AMPERSAND = 0x0026; static final char LESSTHAN_SIGN = 0x003C; static final char GREATERTHAN_SIGN = 0x003E; @@ -31,7 +32,9 @@ final class Characters { static final int EOF = -1; static final char TAB = 0x0009; + /** \r */ static final char CR = 0x000D; + /** \n */ static final char LF = 0x000A; static final char FF = 0x000C; static final char SPACE = 0x0020; diff --git a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java index a341982..480b094 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java @@ -28,70 +28,75 @@ abstract class ProcessedInputStream { protected abstract int read(); - static class StringProcessedInputStream extends ProcessedInputStream { - protected int pos = 0; - protected final char[] input; - - StringProcessedInputStream(String input) { - this.input = normalize(input); + int readUntil(ResizableCharBuilder builder, boolean stopAtAmpersand, boolean stopAtLessThan) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if ((stopAtAmpersand && chr == Characters.AMPERSAND) || (stopAtLessThan && chr == Characters.LESSTHAN_SIGN) || chr == Characters.NULL || chr == Characters.EOF) { + return chr; + } + builder.append((char) chr); } + return readUntilInternal(builder, stopAtAmpersand, stopAtLessThan); + } - private static char[] normalize(String s) { - char[] arr = s.toCharArray(); - int n = arr.length; - int j = 0; - for (int i = 0; i < n; i++) { - char c = arr[i]; - if (c == '\r') { - arr[j++] = '\n'; - if (i + 1 < n && arr[i + 1] == '\n') { - i++; - } - } else { - arr[j++] = c; - } + int readUntilAttributeValue(ResizableCharBuilder builder, int quoteChar, boolean stopAtAmpersand) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (chr == quoteChar || (stopAtAmpersand && chr == Characters.AMPERSAND) || chr == Characters.NULL || chr == Characters.EOF) { + return chr; } - return j == n ? arr : Arrays.copyOf(arr, j); + builder.append((char) chr); } + return readUntilAttributeValueInternal(builder, quoteChar, stopAtAmpersand); + } - @Override - protected int read() { - if (pos < input.length) { - return input[pos++]; + int readUntilAttributeValueUnquoted(ResizableCharBuilder builder) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (Common.isTabLfFfCrOrSpace(chr) || chr == Characters.AMPERSAND || chr == '>' || chr == Characters.NULL || + chr == '"' || chr == '\'' || chr == Characters.LESSTHAN_SIGN || chr == '=' || chr == '`' || chr == Characters.EOF) { + return chr; } - return -1; + builder.append((char) chr); } + return readUntilAttributeValueUnquotedInternal(builder); } - static final class ReaderProcessedInputStream extends ProcessedInputStream { - - private final Reader reader; - private boolean crFound; - - ReaderProcessedInputStream(Reader reader) { - this.reader = reader; + protected int readUntilInternal(ResizableCharBuilder builder, boolean stopAtAmpersand, boolean stopAtLessThan) { + int chr; + while ((chr = read()) != -1) { + if ((stopAtAmpersand && chr == Characters.AMPERSAND) || (stopAtLessThan && chr == Characters.LESSTHAN_SIGN) || chr == Characters.NULL) { + return chr; + } + builder.append((char) chr); } + return -1; + } - @Override - protected int read() { - try { - int chr = reader.read(); - if (crFound) { - crFound = false; - if (chr == Characters.LF) { - chr = reader.read(); - } - } + protected int readUntilAttributeValueInternal(ResizableCharBuilder builder, int quoteChar, boolean stopAtAmpersand) { + int chr; + while ((chr = read()) != -1) { + if (chr == quoteChar || (stopAtAmpersand && chr == Characters.AMPERSAND) || chr == Characters.NULL) { + return chr; + } + builder.append((char) chr); + } + return -1; + } - if (chr == Characters.CR) { - crFound = true; - chr = Characters.LF; - } + protected int readUntilAttributeValueUnquotedInternal(ResizableCharBuilder builder) { + int chr; + while ((chr = read()) != -1) { + if (Common.isTabLfFfCrOrSpace(chr) || chr == Characters.AMPERSAND || chr == '>' || chr == Characters.NULL || + chr == '"' || chr == '\'' || chr == Characters.LESSTHAN_SIGN || chr == '=' || chr == '`') { return chr; - } catch (IOException ioe) { - throw new ParserException(ioe); } + builder.append((char) chr); } + return -1; } // @@ -128,4 +133,133 @@ void consume(int count) { consume(); } } + + + + static class StringProcessedInputStream extends ProcessedInputStream { + private int pos = 0; + private final char[] input; + private final int length; + + + StringProcessedInputStream(String input) { + char[] toNormalize = input.toCharArray(); + int j = 0; + for (int i = 0; i < toNormalize.length; i++) { + char c = toNormalize[i]; + if (c == Characters.CR) { + toNormalize[j++] = Characters.LF; + if (i + 1 < toNormalize.length && toNormalize[i + 1] == Characters.LF) { + i++; + } + } else { + toNormalize[j++] = c; + } + } + this.input = toNormalize; + this.length = j; + } + + // used for test + protected int getCharAt(int pos) { + if (pos >= length) { + return -1; + } + return input[pos]; + } + + @Override + protected int read() { + if (pos < length) { + return input[pos++]; + } + return -1; + } + + @Override + protected int readUntilInternal(ResizableCharBuilder builder, boolean stopAtAmpersand, boolean stopAtLessThan) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if ((stopAtAmpersand && c == Characters.AMPERSAND) || (stopAtLessThan && c == Characters.LESSTHAN_SIGN) || c == Characters.NULL) { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return -1; + } + + @Override + protected int readUntilAttributeValueInternal(ResizableCharBuilder builder, int quoteChar, boolean stopAtAmpersand) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (c == quoteChar || (stopAtAmpersand && c == Characters.AMPERSAND) || c == Characters.NULL) { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return -1; + } + + @Override + protected int readUntilAttributeValueUnquotedInternal(ResizableCharBuilder builder) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (Common.isTabLfFfCrOrSpace(c) || c == Characters.AMPERSAND || c == '>' || c == Characters.NULL || + c == '"' || c == '\'' || c == Characters.LESSTHAN_SIGN || c == '=' || c == '`') { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return -1; + } + } + + static final class ReaderProcessedInputStream extends ProcessedInputStream { + + private final Reader reader; + private boolean crFound; + + ReaderProcessedInputStream(Reader reader) { + this.reader = reader; + } + + @Override + protected int read() { + try { + int chr = reader.read(); + if (crFound) { + crFound = false; + if (chr == Characters.LF) { + chr = reader.read(); + } + } + + if (chr == Characters.CR) { + crFound = true; + chr = Characters.LF; + } + return chr; + } catch (IOException ioe) { + throw new ParserException(ioe); + } + } + } } diff --git a/src/test/java/ch/digitalfondue/jfiveparse/ProcessedInputStreamWithParseError.java b/src/test/java/ch/digitalfondue/jfiveparse/ProcessedInputStreamWithParseError.java index cd5c8c5..1cd2459 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/ProcessedInputStreamWithParseError.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/ProcessedInputStreamWithParseError.java @@ -45,14 +45,6 @@ int consume() { return chr; } - private int getCharAt(int position) { - try { - return input[position]; - } catch (IndexOutOfBoundsException s) { - return -1; - } - } - private int getCurrentInputCharacter() { if (buffer.length() > 0) { return buffer.getCharAt(0); From 8d1449ed5bf8f2d6d40c1b51d1cccaab7697bfea Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Wed, 20 May 2026 22:25:48 +0200 Subject: [PATCH 3/6] refactor/add bypass for text node/attribute node --- .../digitalfondue/jfiveparse/Tokenizer.java | 2 +- .../jfiveparse/TokenizerState.java | 252 +++++++++--------- 2 files changed, 128 insertions(+), 126 deletions(-) diff --git a/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java b/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java index 563b3eb..8790d55 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java @@ -37,7 +37,7 @@ final class Tokenizer { // tag related private Attributes attributes; private final ResizableCharBuilder currentAttributeName = new ResizableCharBuilder(); - private ResizableCharBuilder currentAttributeValue; + ResizableCharBuilder currentAttributeValue; private int currentAttributeQuoteType; private boolean selfClosing; final ResizableCharBuilder tagName = new ResizableCharBuilder(); diff --git a/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java b/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java index 6fdbdab..6efe6f8 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java @@ -344,24 +344,21 @@ static void handleScriptDataState(Tokenizer tokenizer, ProcessedInputStream proc // optimization: bypass if possible if (tokenizer.getState() == SCRIPT_DATA_STATE && previousInsertionMode == currentInsertionMode && textNode != null) { - for (;;) { - int internalChr = processedInputStream.getNextInputCharacterAndConsume(); - switch (internalChr) { - case Characters.EOF: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.emitEOF(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.LESSTHAN_SIGN: - tokenizer.setState(SCRIPT_DATA_LESS_THAN_SIGN_STATE); - return; - default: - textNode.append((char) internalChr); - break; - } + int internalChr = processedInputStream.readUntil(textNode, false, true); + switch (internalChr) { + case Characters.EOF: + tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); + tokenizer.emitEOF(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.LESSTHAN_SIGN: + tokenizer.setState(SCRIPT_DATA_LESS_THAN_SIGN_STATE); + return; + default: + break; } } break; @@ -800,7 +797,24 @@ static void handlePlainTextState(Tokenizer tokenizer, ProcessedInputStream proce tokenizer.emitEOF(); break; default: + int previousInsertionMode = tokenizer.getTokenHandlerInsertionMode(); tokenizer.emitCharacter(chr); + int currentInsertionMode = tokenizer.getTokenHandlerInsertionMode(); + ResizableCharBuilder textNode = tokenizer.getTokenHandlerInsertCharacterPreviousTextNode(); + if (tokenizer.getState() == PLAINTEXT_STATE && previousInsertionMode == currentInsertionMode && textNode != null) { + int internalChr = processedInputStream.readUntil(textNode, false, false); + switch (internalChr) { + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitEOF(); + return; + default: + break; + } + } break; } } @@ -827,25 +841,20 @@ static void handleRawtextState(Tokenizer tokenizer, ProcessedInputStream process int currentInsertionMode = tokenizer.getTokenHandlerInsertionMode(); ResizableCharBuilder textNode = tokenizer.getTokenHandlerInsertCharacterPreviousTextNode(); if (tokenizer.getState() == RAWTEXT_STATE && previousInsertionMode == currentInsertionMode && textNode != null) { - - for (;;) { - int internalChr = processedInputStream.getNextInputCharacterAndConsume(); - switch (internalChr) { - case Characters.LESSTHAN_SIGN: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.setState(RAWTEXT_LESS_THAN_SIGN_STATE); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.emitEOF(); - return; - default: - textNode.append((char) internalChr); - } + int internalChr = processedInputStream.readUntil(textNode, false, true); + switch (internalChr) { + case Characters.LESSTHAN_SIGN: + tokenizer.setState(RAWTEXT_LESS_THAN_SIGN_STATE); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitEOF(); + return; + default: + break; } } break; @@ -2243,100 +2252,93 @@ static void handleBeforeAttributeValueState(Tokenizer tokenizer, ProcessedInputS static void handleAttributeValueDoubleQuotedState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { // vvv optimization vvv, we try to append as much as possible - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.QUOTATION_MARK: - tokenizer.setState(AFTER_ATTRIBUTE_VALUE_QUOTED_STATE); - return; - case Characters.AMPERSAND: - // save current state - tokenizer.setPreviousState(ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE); - // - tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); - tokenizer.additionalAllowedCharacter = Characters.QUOTATION_MARK; - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCurrentAttributeValue(chr); - } - } while (true); + + int chr = processedInputStream.readUntilAttributeValue(tokenizer.currentAttributeValue, Characters.QUOTATION_MARK, true); + switch (chr) { + case Characters.QUOTATION_MARK: + tokenizer.setState(AFTER_ATTRIBUTE_VALUE_QUOTED_STATE); + return; + case Characters.AMPERSAND: + // save current state + tokenizer.setPreviousState(ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE); + // + tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); + tokenizer.additionalAllowedCharacter = Characters.QUOTATION_MARK; + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + } } static void handleAttributeValueSingleQuotedState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.APOSTROPHE: - tokenizer.setState(AFTER_ATTRIBUTE_VALUE_QUOTED_STATE); - return; - case Characters.AMPERSAND: - // save current state - tokenizer.setPreviousState(ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE); - // - tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); - tokenizer.additionalAllowedCharacter = Characters.APOSTROPHE; - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCurrentAttributeValue(chr); - } - } while (true); + int chr = processedInputStream.readUntilAttributeValue(tokenizer.currentAttributeValue, Characters.APOSTROPHE, true); + switch (chr) { + case Characters.APOSTROPHE: + tokenizer.setState(AFTER_ATTRIBUTE_VALUE_QUOTED_STATE); + return; + case Characters.AMPERSAND: + // save current state + tokenizer.setPreviousState(ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE); + // + tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); + tokenizer.additionalAllowedCharacter = Characters.APOSTROPHE; + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleAttributeValueUnquotedState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.TAB: - case Characters.LF: - case Characters.FF: - case Characters.SPACE: - tokenizer.setState(BEFORE_ATTRIBUTE_NAME_STATE); - return; - case Characters.AMPERSAND: - tokenizer.setPreviousState(ATTRIBUTE_VALUE_UNQUOTED_STATE); - tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); - tokenizer.additionalAllowedCharacter = Characters.GREATERTHAN_SIGN; - return; - case Characters.GREATERTHAN_SIGN: - tokenizer.setState(DATA_STATE); - tokenizer.addCurrentAttributeAndEmitToken(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.QUOTATION_MARK: - case Characters.APOSTROPHE: - case Characters.LESSTHAN_SIGN: - case Characters.EQUALS_SIGN: - case Characters.GRAVE_ACCENT: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeValue(chr); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCurrentAttributeValue(chr); - } - } while (true); + int chr = processedInputStream.readUntilAttributeValueUnquoted(tokenizer.currentAttributeValue); + switch (chr) { + case Characters.TAB: + case Characters.LF: + case Characters.FF: + case Characters.SPACE: + tokenizer.setState(BEFORE_ATTRIBUTE_NAME_STATE); + return; + case Characters.AMPERSAND: + tokenizer.setPreviousState(ATTRIBUTE_VALUE_UNQUOTED_STATE); + tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); + tokenizer.additionalAllowedCharacter = Characters.GREATERTHAN_SIGN; + return; + case Characters.GREATERTHAN_SIGN: + tokenizer.setState(DATA_STATE); + tokenizer.addCurrentAttributeAndEmitToken(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.QUOTATION_MARK: + case Characters.APOSTROPHE: + case Characters.LESSTHAN_SIGN: + case Characters.EQUALS_SIGN: + case Characters.GRAVE_ACCENT: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeValue(chr); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleCharacterReferenceInAttributeValueState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { From dacf01446f8c5d4be2ad9666e5405610bbcf6ebc Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Sun, 24 May 2026 15:08:39 +0200 Subject: [PATCH 4/6] Optim input tmp (#63) * tmp * refactor --- .../digitalfondue/jfiveparse/Characters.java | 2 + .../jfiveparse/ProcessedInputStream.java | 166 +++++++++++-- .../digitalfondue/jfiveparse/Tokenizer.java | 4 +- .../jfiveparse/TokenizerState.java | 232 +++++++++--------- 4 files changed, 274 insertions(+), 130 deletions(-) diff --git a/src/main/java/ch/digitalfondue/jfiveparse/Characters.java b/src/main/java/ch/digitalfondue/jfiveparse/Characters.java index f9eaf16..d7380bd 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/Characters.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/Characters.java @@ -24,7 +24,9 @@ final class Characters { static final char EXCLAMATION_MARK = 0x0021; /** & */ static final char AMPERSAND = 0x0026; + /** < */ static final char LESSTHAN_SIGN = 0x003C; + /** > */ static final char GREATERTHAN_SIGN = 0x003E; static final char SOLIDUS = 0x002F; static final char QUESTION_MARK = 0x003F; diff --git a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java index 480b094..50c86fa 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java @@ -52,12 +52,18 @@ int readUntilAttributeValue(ResizableCharBuilder builder, int quoteChar, boolean return readUntilAttributeValueInternal(builder, quoteChar, stopAtAmpersand); } + private static boolean mustStopReadUntilAttributeValueUnquoted(int chr) { + return Common.isTabLfFfCrOrSpace(chr) || chr == Characters.AMPERSAND || chr == Characters.GREATERTHAN_SIGN + || chr == Characters.NULL || chr == Characters.QUOTATION_MARK || + chr == Characters.APOSTROPHE || chr == Characters.LESSTHAN_SIGN || + chr == Characters.EQUALS_SIGN || chr == Characters.GRAVE_ACCENT || chr == Characters.EOF; + } + int readUntilAttributeValueUnquoted(ResizableCharBuilder builder) { int chr; while (!buffer.isEmpty) { chr = buffer.removeFirst(); - if (Common.isTabLfFfCrOrSpace(chr) || chr == Characters.AMPERSAND || chr == '>' || chr == Characters.NULL || - chr == '"' || chr == '\'' || chr == Characters.LESSTHAN_SIGN || chr == '=' || chr == '`' || chr == Characters.EOF) { + if (mustStopReadUntilAttributeValueUnquoted(chr)) { return chr; } builder.append((char) chr); @@ -65,38 +71,111 @@ int readUntilAttributeValueUnquoted(ResizableCharBuilder builder) { return readUntilAttributeValueUnquotedInternal(builder); } + int readUntilTagName(ResizableCharBuilder builder) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (Common.isTabLfFfCrOrSpace(chr) || chr == Characters.SOLIDUS || chr == Characters.GREATERTHAN_SIGN || chr == Characters.NULL || chr == Characters.EOF) { + return chr; + } + builder.append((char) chr); + } + return readUntilTagNameInternal(builder); + } + + private static boolean mustStopReadUntilAttributeName(int chr) { + return Common.isTabLfFfCrOrSpace(chr) || chr == Characters.SOLIDUS || chr == Characters.EQUALS_SIGN || chr == Characters.GREATERTHAN_SIGN || chr == Characters.NULL || + chr == Characters.QUOTATION_MARK || chr == Characters.APOSTROPHE || chr == Characters.LESSTHAN_SIGN || chr == Characters.EOF; + } + + int readUntilAttributeName(ResizableCharBuilder builder) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (mustStopReadUntilAttributeName(chr)) { + return chr; + } + builder.append((char) chr); + } + return readUntilAttributeNameInternal(builder); + } + + int readUntilComment(ResizableCharBuilder builder) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (chr == Characters.HYPHEN_MINUS || chr == Characters.NULL || chr == Characters.EOF) { + return chr; + } + builder.append((char) chr); + } + return readUntilCommentInternal(builder); + } + protected int readUntilInternal(ResizableCharBuilder builder, boolean stopAtAmpersand, boolean stopAtLessThan) { int chr; - while ((chr = read()) != -1) { + while ((chr = read()) != Characters.EOF) { if ((stopAtAmpersand && chr == Characters.AMPERSAND) || (stopAtLessThan && chr == Characters.LESSTHAN_SIGN) || chr == Characters.NULL) { return chr; } builder.append((char) chr); } - return -1; + return Characters.EOF; } protected int readUntilAttributeValueInternal(ResizableCharBuilder builder, int quoteChar, boolean stopAtAmpersand) { int chr; - while ((chr = read()) != -1) { + while ((chr = read()) != Characters.EOF) { if (chr == quoteChar || (stopAtAmpersand && chr == Characters.AMPERSAND) || chr == Characters.NULL) { return chr; } builder.append((char) chr); } - return -1; + return Characters.EOF; } protected int readUntilAttributeValueUnquotedInternal(ResizableCharBuilder builder) { int chr; - while ((chr = read()) != -1) { - if (Common.isTabLfFfCrOrSpace(chr) || chr == Characters.AMPERSAND || chr == '>' || chr == Characters.NULL || - chr == '"' || chr == '\'' || chr == Characters.LESSTHAN_SIGN || chr == '=' || chr == '`') { + while ((chr = read()) != Characters.EOF) { + if (mustStopReadUntilAttributeValueUnquoted(chr)) { + return chr; + } + builder.append((char) chr); + } + return Characters.EOF; + } + + protected int readUntilTagNameInternal(ResizableCharBuilder builder) { + int chr; + while ((chr = read()) != Characters.EOF) { + if (Common.isTabLfFfCrOrSpace(chr) || chr == Characters.SOLIDUS || chr == Characters.GREATERTHAN_SIGN || chr == Characters.NULL) { + return chr; + } + builder.append((char) chr); + } + return Characters.EOF; + } + + protected int readUntilAttributeNameInternal(ResizableCharBuilder builder) { + int chr; + while ((chr = read()) != Characters.EOF) { + if (mustStopReadUntilAttributeName(chr)) { + return chr; + } + builder.append((char) chr); + } + return Characters.EOF; + } + + protected int readUntilCommentInternal(ResizableCharBuilder builder) { + int chr; + while ((chr = read()) != Characters.EOF) { + if (chr == Characters.HYPHEN_MINUS || chr == Characters.NULL) { return chr; } builder.append((char) chr); } - return -1; + return Characters.EOF; } // @@ -163,7 +242,7 @@ static class StringProcessedInputStream extends ProcessedInputStream { // used for test protected int getCharAt(int pos) { if (pos >= length) { - return -1; + return Characters.EOF; } return input[pos]; } @@ -173,7 +252,7 @@ protected int read() { if (pos < length) { return input[pos++]; } - return -1; + return Characters.EOF; } @Override @@ -191,7 +270,7 @@ protected int readUntilInternal(ResizableCharBuilder builder, boolean stopAtAmpe } builder.append(input, pos, n - pos); pos = n; - return -1; + return Characters.EOF; } @Override @@ -209,7 +288,7 @@ protected int readUntilAttributeValueInternal(ResizableCharBuilder builder, int } builder.append(input, pos, n - pos); pos = n; - return -1; + return Characters.EOF; } @Override @@ -218,8 +297,61 @@ protected int readUntilAttributeValueUnquotedInternal(ResizableCharBuilder build int i = pos; while (i < n) { char c = input[i]; - if (Common.isTabLfFfCrOrSpace(c) || c == Characters.AMPERSAND || c == '>' || c == Characters.NULL || - c == '"' || c == '\'' || c == Characters.LESSTHAN_SIGN || c == '=' || c == '`') { + if (mustStopReadUntilAttributeValueUnquoted(c)) { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return Characters.EOF; + } + + @Override + protected int readUntilTagNameInternal(ResizableCharBuilder builder) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (Common.isTabLfFfCrOrSpace(c) || c == Characters.SOLIDUS || c == Characters.GREATERTHAN_SIGN || c == Characters.NULL) { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return Characters.EOF; + } + + @Override + protected int readUntilAttributeNameInternal(ResizableCharBuilder builder) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (mustStopReadUntilAttributeName(c)) { + builder.append(input, pos, i - pos); // append remaining + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return Characters.EOF; + } + + @Override + protected int readUntilCommentInternal(ResizableCharBuilder builder) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (c == '-' || c == Characters.NULL) { builder.append(input, pos, i - pos); pos = i + 1; return c; @@ -228,7 +360,7 @@ protected int readUntilAttributeValueUnquotedInternal(ResizableCharBuilder build } builder.append(input, pos, n - pos); pos = n; - return -1; + return Characters.EOF; } } diff --git a/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java b/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java index 8790d55..0141829 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java @@ -36,7 +36,7 @@ final class Tokenizer { // tag related private Attributes attributes; - private final ResizableCharBuilder currentAttributeName = new ResizableCharBuilder(); + final ResizableCharBuilder currentAttributeName = new ResizableCharBuilder(); ResizableCharBuilder currentAttributeValue; private int currentAttributeQuoteType; private boolean selfClosing; @@ -51,7 +51,7 @@ final class Tokenizer { private StringBuilder doctypeSystemIdentifier; // comment related - private ResizableCharBuilder commentToken; + ResizableCharBuilder commentToken; // private final ResizableCharBuilder temporaryBuffer = new ResizableCharBuilder(); diff --git a/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java b/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java index 6efe6f8..cf20553 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java @@ -159,36 +159,32 @@ static void handleEndTagOpenState(Tokenizer tokenizer, ProcessedInputStream proc } static void handleTagNameState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - // bypass and optimization, as we are accumulating the tag name, we can do it here - // in a single loop, avoiding method calls - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.TAB: - case Characters.LF: - case Characters.FF: - case Characters.SPACE: - tokenizer.setState(BEFORE_ATTRIBUTE_NAME_STATE); - return; - case Characters.SOLIDUS: - tokenizer.setState(SELF_CLOSING_START_TAG_STATE); - return; - case Characters.GREATERTHAN_SIGN: - tokenizer.setState(DATA_STATE); - tokenizer.addCurrentAttributeAndEmitToken(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCurrentTagToken(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.tagName.append((char) chr); - } - } while (true); + int chr = processedInputStream.readUntilTagName(tokenizer.tagName); + switch (chr) { + case Characters.TAB: + case Characters.LF: + case Characters.FF: + case Characters.SPACE: + tokenizer.setState(BEFORE_ATTRIBUTE_NAME_STATE); + return; + case Characters.SOLIDUS: + tokenizer.setState(SELF_CLOSING_START_TAG_STATE); + return; + case Characters.GREATERTHAN_SIGN: + tokenizer.setState(DATA_STATE); + tokenizer.addCurrentAttributeAndEmitToken(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentTagToken(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleSelfClosingStartTagState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { @@ -229,7 +225,30 @@ static void handleRCDataState(Tokenizer tokenizer, ProcessedInputStream processe tokenizer.emitEOF(); // does nothing break; default: + int previousInsertionMode = tokenizer.getTokenHandlerInsertionMode(); tokenizer.emitCharacter(chr); + int currentInsertionMode = tokenizer.getTokenHandlerInsertionMode(); + ResizableCharBuilder textNode = tokenizer.getTokenHandlerInsertCharacterPreviousTextNode(); + if (tokenizer.getState() == RCDATA_STATE && previousInsertionMode == currentInsertionMode && textNode != null) { + int internalChr = processedInputStream.readUntil(textNode, true, true); + switch (internalChr) { + case Characters.AMPERSAND: + tokenizer.setState(CHARACTER_REFERENCE_IN_RCDATA_STATE); + return; + case Characters.LESSTHAN_SIGN: + tokenizer.setState(RCDATA_LESS_THAN_SIGN_STATE); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitEOF(); + return; + default: + break; + } + } break; } } @@ -1552,25 +1571,23 @@ static void handleCommentStartDashState(Tokenizer tokenizer, ProcessedInputStrea } static void handleCommentState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.HYPHEN_MINUS: - tokenizer.setState(COMMENT_END_DASH_STATE); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCommentCharacter(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - tokenizer.emitComment(); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCommentCharacter(chr); - } - } while (true); + int chr = processedInputStream.readUntilComment(tokenizer.commentToken); + switch (chr) { + case Characters.HYPHEN_MINUS: + tokenizer.setState(COMMENT_END_DASH_STATE); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCommentCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + tokenizer.emitComment(); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleCommentEndDashState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { @@ -2006,29 +2023,26 @@ static void handleDataState(Tokenizer tokenizer, ProcessedInputStream processedI && (currentInsertionMode == TreeConstructor.IM_IN_BODY || currentInsertionMode == TreeConstructor.IM_IN_CELL) && tokenizer.isTokenHandlerInHtmlContent() && textNode != null) { - for (;;) { - int internalChr = processedInputStream.getNextInputCharacterAndConsume(); - switch (internalChr) { - case Characters.EOF: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.emitEOF(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.emitCharacter(internalChr); - return; - case Characters.AMPERSAND: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.setState(CHARACTER_REFERENCE_IN_DATA_STATE); - return; - case Characters.LESSTHAN_SIGN: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.setState(TAG_OPEN_STATE); - return; - default: - textNode.append((char) internalChr); - break; - } + int internalChr = processedInputStream.readUntil(textNode, true, true); + switch (internalChr) { + case Characters.EOF: + tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); + tokenizer.emitEOF(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(internalChr); + return; + case Characters.AMPERSAND: + tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); + tokenizer.setState(CHARACTER_REFERENCE_IN_DATA_STATE); + return; + case Characters.LESSTHAN_SIGN: + tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); + tokenizer.setState(TAG_OPEN_STATE); + return; + default: + break; } } @@ -2114,45 +2128,41 @@ static void handleBeforeAttributeNameState(Tokenizer tokenizer, ProcessedInputSt } static void handleAttributeNameState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - // vvv optimization vvv, we try to bypass as much as possible for the case "appendCurrentAttributeName" - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.TAB: - case Characters.LF: - case Characters.FF: - case Characters.SPACE: - tokenizer.setState(AFTER_ATTRIBUTE_NAME_STATE); - return; - case Characters.SOLIDUS: - tokenizer.setState(SELF_CLOSING_START_TAG_STATE); - return; - case Characters.EQUALS_SIGN: - tokenizer.setState(BEFORE_ATTRIBUTE_VALUE_STATE); - return; - case Characters.GREATERTHAN_SIGN: - tokenizer.setState(DATA_STATE); - tokenizer.addCurrentAttributeAndEmitToken(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeName(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.QUOTATION_MARK: - case Characters.APOSTROPHE: - case Characters.LESSTHAN_SIGN: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeName(chr); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCurrentAttributeName(chr); - break; - } - } while (true); + int chr = processedInputStream.readUntilAttributeName(tokenizer.currentAttributeName); + switch (chr) { + case Characters.TAB: + case Characters.LF: + case Characters.FF: + case Characters.SPACE: + tokenizer.setState(AFTER_ATTRIBUTE_NAME_STATE); + return; + case Characters.SOLIDUS: + tokenizer.setState(SELF_CLOSING_START_TAG_STATE); + return; + case Characters.EQUALS_SIGN: + tokenizer.setState(BEFORE_ATTRIBUTE_VALUE_STATE); + return; + case Characters.GREATERTHAN_SIGN: + tokenizer.setState(DATA_STATE); + tokenizer.addCurrentAttributeAndEmitToken(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeName(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.QUOTATION_MARK: + case Characters.APOSTROPHE: + case Characters.LESSTHAN_SIGN: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeName(chr); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleAfterAttributeNameState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { From 5c93b77e881bba6f1e46edc1a6c076c5a4e8dac2 Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Sun, 24 May 2026 16:21:03 +0200 Subject: [PATCH 5/6] misc cleanup --- src/main/java/ch/digitalfondue/jfiveparse/Attributes.java | 2 +- .../ch/digitalfondue/jfiveparse/ResizableCharBuilder.java | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/main/java/ch/digitalfondue/jfiveparse/Attributes.java b/src/main/java/ch/digitalfondue/jfiveparse/Attributes.java index 0d5b13e..391c971 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/Attributes.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/Attributes.java @@ -70,7 +70,7 @@ Set keySet() { private void ensureMap() { if (attributes == null) { - attributes = new LinkedHashMap<>(); + attributes = new LinkedHashMap<>(8); } } diff --git a/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java b/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java index 9ac93a3..5873407 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java @@ -53,12 +53,10 @@ int pos() { } void append(char c) { - try { - buff[pos++] = c; - } catch (IndexOutOfBoundsException e) { + if (pos == buff.length) { buff = Arrays.copyOf(buff, buff.length * 2 + 2); - buff[pos - 1] = c; } + buff[pos++] = c; } String toLowerCase() { From 317f1ee62136f525e56449013ad3b614a875a3d4 Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Sun, 24 May 2026 18:52:38 +0200 Subject: [PATCH 6/6] unused import --- src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java | 1 - .../java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java b/src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java index e977336..696c140 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.TreeMap; import java.util.zip.GZIPInputStream; diff --git a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java index 50c86fa..2c07b72 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java @@ -17,7 +17,6 @@ import java.io.IOException; import java.io.Reader; -import java.util.Arrays; /** * Wrapped and abstracted input. Can most likely be optimized. @@ -351,7 +350,7 @@ protected int readUntilCommentInternal(ResizableCharBuilder builder) { int i = pos; while (i < n) { char c = input[i]; - if (c == '-' || c == Characters.NULL) { + if (c == Characters.HYPHEN_MINUS || c == Characters.NULL) { builder.append(input, pos, i - pos); pos = i + 1; return c;