diff --git a/src/main/java/ch/digitalfondue/jfiveparse/Attributes.java b/src/main/java/ch/digitalfondue/jfiveparse/Attributes.java index 0d5b13e..391c971 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/Attributes.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/Attributes.java @@ -70,7 +70,7 @@ Set keySet() { private void ensureMap() { if (attributes == null) { - attributes = new LinkedHashMap<>(); + attributes = new LinkedHashMap<>(8); } } diff --git a/src/main/java/ch/digitalfondue/jfiveparse/Characters.java b/src/main/java/ch/digitalfondue/jfiveparse/Characters.java index 512be84..d7380bd 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/Characters.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/Characters.java @@ -22,8 +22,11 @@ final class Characters { static final char NULL = 0x0000; static final char EXCLAMATION_MARK = 0x0021; + /** {@code &} */ static final char AMPERSAND = 0x0026; + /** {@code <} */ static final char LESSTHAN_SIGN = 0x003C; + /** {@code >} */ static final char GREATERTHAN_SIGN = 0x003E; static final char SOLIDUS = 0x002F; static final char QUESTION_MARK = 0x003F; @@ -31,7 +34,9 @@ final class Characters { static final int EOF = -1; static final char TAB = 0x0009; + /** \r */ static final char CR = 0x000D; + /** \n */ static final char LF = 0x000A; static final char FF = 0x000C; static final char SPACE = 0x0020; diff --git a/src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java b/src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java index e977336..696c140 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/EntitiesPrefix.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Map; import java.util.TreeMap; import java.util.zip.GZIPInputStream; diff --git a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java 
index 75826c1..2c07b72 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/ProcessedInputStream.java @@ -19,82 +19,170 @@ import java.io.Reader; /** - * Even though the html5 specification is working with codepoints, this input - * stream will only emit chars and "-1". - * - * This has some interesting consequences that we will need to fully explore: - * + * Wrapped and abstracted input. Can most likely be optimized. */ abstract class ProcessedInputStream { - private boolean crFound; protected final ResizableIntBuffer buffer = new ResizableIntBuffer(); protected abstract int read(); - static class StringProcessedInputStream extends ProcessedInputStream { - protected int pos = 0; - protected final char[] input; + int readUntil(ResizableCharBuilder builder, boolean stopAtAmpersand, boolean stopAtLessThan) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if ((stopAtAmpersand && chr == Characters.AMPERSAND) || (stopAtLessThan && chr == Characters.LESSTHAN_SIGN) || chr == Characters.NULL || chr == Characters.EOF) { + return chr; + } + builder.append((char) chr); + } + return readUntilInternal(builder, stopAtAmpersand, stopAtLessThan); + } - StringProcessedInputStream(String input) { - this.input = input.toCharArray(); + int readUntilAttributeValue(ResizableCharBuilder builder, int quoteChar, boolean stopAtAmpersand) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (chr == quoteChar || (stopAtAmpersand && chr == Characters.AMPERSAND) || chr == Characters.NULL || chr == Characters.EOF) { + return chr; + } + builder.append((char) chr); } + return readUntilAttributeValueInternal(builder, quoteChar, stopAtAmpersand); + } - @Override - protected int read() { - try { - return input[pos++]; - } catch (IndexOutOfBoundsException s) { - return -1; + private static boolean mustStopReadUntilAttributeValueUnquoted(int chr) { + return 
Common.isTabLfFfCrOrSpace(chr) || chr == Characters.AMPERSAND || chr == Characters.GREATERTHAN_SIGN + || chr == Characters.NULL || chr == Characters.QUOTATION_MARK || + chr == Characters.APOSTROPHE || chr == Characters.LESSTHAN_SIGN || + chr == Characters.EQUALS_SIGN || chr == Characters.GRAVE_ACCENT || chr == Characters.EOF; + } + + int readUntilAttributeValueUnquoted(ResizableCharBuilder builder) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (mustStopReadUntilAttributeValueUnquoted(chr)) { + return chr; } + builder.append((char) chr); } + return readUntilAttributeValueUnquotedInternal(builder); } - static final class ReaderProcessedInputStream extends ProcessedInputStream { + int readUntilTagName(ResizableCharBuilder builder) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (Common.isTabLfFfCrOrSpace(chr) || chr == Characters.SOLIDUS || chr == Characters.GREATERTHAN_SIGN || chr == Characters.NULL || chr == Characters.EOF) { + return chr; + } + builder.append((char) chr); + } + return readUntilTagNameInternal(builder); + } - private final Reader reader; + private static boolean mustStopReadUntilAttributeName(int chr) { + return Common.isTabLfFfCrOrSpace(chr) || chr == Characters.SOLIDUS || chr == Characters.EQUALS_SIGN || chr == Characters.GREATERTHAN_SIGN || chr == Characters.NULL || + chr == Characters.QUOTATION_MARK || chr == Characters.APOSTROPHE || chr == Characters.LESSTHAN_SIGN || chr == Characters.EOF; + } - ReaderProcessedInputStream(Reader reader) { - this.reader = reader; + int readUntilAttributeName(ResizableCharBuilder builder) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (mustStopReadUntilAttributeName(chr)) { + return chr; + } + builder.append((char) chr); } + return readUntilAttributeNameInternal(builder); + } - @Override - protected int read() { - try { - return reader.read(); - } catch (IOException ioe) { - throw new ParserException(ioe); + int 
readUntilComment(ResizableCharBuilder builder) { + int chr; + while (!buffer.isEmpty) { + chr = buffer.removeFirst(); + if (chr == Characters.HYPHEN_MINUS || chr == Characters.NULL || chr == Characters.EOF) { + return chr; } + builder.append((char) chr); } + return readUntilCommentInternal(builder); } - // - private int readWithCRHandling() { - int chr = read(); - if (crFound) { - //chr = handleCrFoundInternal(chr); - crFound = false; - if (chr == Characters.LF) { - chr = read(); + protected int readUntilInternal(ResizableCharBuilder builder, boolean stopAtAmpersand, boolean stopAtLessThan) { + int chr; + while ((chr = read()) != Characters.EOF) { + if ((stopAtAmpersand && chr == Characters.AMPERSAND) || (stopAtLessThan && chr == Characters.LESSTHAN_SIGN) || chr == Characters.NULL) { + return chr; + } + builder.append((char) chr); + } + return Characters.EOF; + } + + protected int readUntilAttributeValueInternal(ResizableCharBuilder builder, int quoteChar, boolean stopAtAmpersand) { + int chr; + while ((chr = read()) != Characters.EOF) { + if (chr == quoteChar || (stopAtAmpersand && chr == Characters.AMPERSAND) || chr == Characters.NULL) { + return chr; + } + builder.append((char) chr); + } + return Characters.EOF; + } + + protected int readUntilAttributeValueUnquotedInternal(ResizableCharBuilder builder) { + int chr; + while ((chr = read()) != Characters.EOF) { + if (mustStopReadUntilAttributeValueUnquoted(chr)) { + return chr; } + builder.append((char) chr); } + return Characters.EOF; + } - if (chr == Characters.CR) { - // handleChrIsCR - crFound = true; - chr = Characters.LF; + protected int readUntilTagNameInternal(ResizableCharBuilder builder) { + int chr; + while ((chr = read()) != Characters.EOF) { + if (Common.isTabLfFfCrOrSpace(chr) || chr == Characters.SOLIDUS || chr == Characters.GREATERTHAN_SIGN || chr == Characters.NULL) { + return chr; + } + builder.append((char) chr); } - return chr; + return Characters.EOF; } + protected int 
readUntilAttributeNameInternal(ResizableCharBuilder builder) { + int chr; + while ((chr = read()) != Characters.EOF) { + if (mustStopReadUntilAttributeName(chr)) { + return chr; + } + builder.append((char) chr); + } + return Characters.EOF; + } + + protected int readUntilCommentInternal(ResizableCharBuilder builder) { + int chr; + while ((chr = read()) != Characters.EOF) { + if (chr == Characters.HYPHEN_MINUS || chr == Characters.NULL) { + return chr; + } + builder.append((char) chr); + } + return Characters.EOF; + } + + // int peekNextInputCharacter(int offset) { if (buffer.length() < offset) { // fill buffer for (int i = buffer.length(); i < offset; i++) { - buffer.add(readWithCRHandling()); + buffer.add(read()); } } return buffer.getCharAt(offset); @@ -111,7 +199,7 @@ int getNextInputCharacterAndConsume() { } int consume() { - return buffer.isEmpty ? readWithCRHandling() : buffer.removeFirst(); + return buffer.isEmpty ? read() : buffer.removeFirst(); } void reconsume(int chr) { @@ -123,4 +211,186 @@ void consume(int count) { consume(); } } + + + + static class StringProcessedInputStream extends ProcessedInputStream { + private int pos = 0; + private final char[] input; + private final int length; + + + StringProcessedInputStream(String input) { + char[] toNormalize = input.toCharArray(); + int j = 0; + for (int i = 0; i < toNormalize.length; i++) { + char c = toNormalize[i]; + if (c == Characters.CR) { + toNormalize[j++] = Characters.LF; + if (i + 1 < toNormalize.length && toNormalize[i + 1] == Characters.LF) { + i++; + } + } else { + toNormalize[j++] = c; + } + } + this.input = toNormalize; + this.length = j; + } + + // used for test + protected int getCharAt(int pos) { + if (pos >= length) { + return Characters.EOF; + } + return input[pos]; + } + + @Override + protected int read() { + if (pos < length) { + return input[pos++]; + } + return Characters.EOF; + } + + @Override + protected int readUntilInternal(ResizableCharBuilder builder, boolean stopAtAmpersand, 
boolean stopAtLessThan) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if ((stopAtAmpersand && c == Characters.AMPERSAND) || (stopAtLessThan && c == Characters.LESSTHAN_SIGN) || c == Characters.NULL) { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return Characters.EOF; + } + + @Override + protected int readUntilAttributeValueInternal(ResizableCharBuilder builder, int quoteChar, boolean stopAtAmpersand) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (c == quoteChar || (stopAtAmpersand && c == Characters.AMPERSAND) || c == Characters.NULL) { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return Characters.EOF; + } + + @Override + protected int readUntilAttributeValueUnquotedInternal(ResizableCharBuilder builder) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (mustStopReadUntilAttributeValueUnquoted(c)) { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return Characters.EOF; + } + + @Override + protected int readUntilTagNameInternal(ResizableCharBuilder builder) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (Common.isTabLfFfCrOrSpace(c) || c == Characters.SOLIDUS || c == Characters.GREATERTHAN_SIGN || c == Characters.NULL) { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return Characters.EOF; + } + + @Override + protected int readUntilAttributeNameInternal(ResizableCharBuilder builder) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (mustStopReadUntilAttributeName(c)) { + builder.append(input, pos, i - pos); // append remaining + pos = i + 1; + return c; + } + i++; + } + 
builder.append(input, pos, n - pos); + pos = n; + return Characters.EOF; + } + + @Override + protected int readUntilCommentInternal(ResizableCharBuilder builder) { + int n = length; + int i = pos; + while (i < n) { + char c = input[i]; + if (c == Characters.HYPHEN_MINUS || c == Characters.NULL) { + builder.append(input, pos, i - pos); + pos = i + 1; + return c; + } + i++; + } + builder.append(input, pos, n - pos); + pos = n; + return Characters.EOF; + } + } + + static final class ReaderProcessedInputStream extends ProcessedInputStream { + + private final Reader reader; + private boolean crFound; + + ReaderProcessedInputStream(Reader reader) { + this.reader = reader; + } + + @Override + protected int read() { + try { + int chr = reader.read(); + if (crFound) { + crFound = false; + if (chr == Characters.LF) { + chr = reader.read(); + } + } + + if (chr == Characters.CR) { + crFound = true; + chr = Characters.LF; + } + return chr; + } catch (IOException ioe) { + throw new ParserException(ioe); + } + } + } } diff --git a/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java b/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java index 90c24d0..5873407 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/ResizableCharBuilder.java @@ -53,12 +53,10 @@ int pos() { } void append(char c) { - try { - buff[pos++] = c; - } catch (IndexOutOfBoundsException e) { + if (pos == buff.length) { buff = Arrays.copyOf(buff, buff.length * 2 + 2); - buff[pos - 1] = c; } + buff[pos++] = c; } String toLowerCase() { @@ -103,5 +101,13 @@ char[] copyBackingCharArray() { return Arrays.copyOf(buff, pos); } + void append(char[] c, int offset, int length) { + if (pos + length >= buff.length) { + buff = Arrays.copyOf(buff, Math.max(pos + length, buff.length * 2 + 2)); + } + System.arraycopy(c, offset, buff, pos, length); + pos += length; + } + } diff --git 
a/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java b/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java index 563b3eb..0141829 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/Tokenizer.java @@ -36,8 +36,8 @@ final class Tokenizer { // tag related private Attributes attributes; - private final ResizableCharBuilder currentAttributeName = new ResizableCharBuilder(); - private ResizableCharBuilder currentAttributeValue; + final ResizableCharBuilder currentAttributeName = new ResizableCharBuilder(); + ResizableCharBuilder currentAttributeValue; private int currentAttributeQuoteType; private boolean selfClosing; final ResizableCharBuilder tagName = new ResizableCharBuilder(); @@ -51,7 +51,7 @@ final class Tokenizer { private StringBuilder doctypeSystemIdentifier; // comment related - private ResizableCharBuilder commentToken; + ResizableCharBuilder commentToken; // private final ResizableCharBuilder temporaryBuffer = new ResizableCharBuilder(); diff --git a/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java b/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java index 6fdbdab..cf20553 100644 --- a/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java +++ b/src/main/java/ch/digitalfondue/jfiveparse/TokenizerState.java @@ -159,36 +159,32 @@ static void handleEndTagOpenState(Tokenizer tokenizer, ProcessedInputStream proc } static void handleTagNameState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - // bypass and optimization, as we are accumulating the tag name, we can do it here - // in a single loop, avoiding method calls - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.TAB: - case Characters.LF: - case Characters.FF: - case Characters.SPACE: - tokenizer.setState(BEFORE_ATTRIBUTE_NAME_STATE); - return; - case Characters.SOLIDUS: - tokenizer.setState(SELF_CLOSING_START_TAG_STATE); - 
return; - case Characters.GREATERTHAN_SIGN: - tokenizer.setState(DATA_STATE); - tokenizer.addCurrentAttributeAndEmitToken(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCurrentTagToken(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.tagName.append((char) chr); - } - } while (true); + int chr = processedInputStream.readUntilTagName(tokenizer.tagName); + switch (chr) { + case Characters.TAB: + case Characters.LF: + case Characters.FF: + case Characters.SPACE: + tokenizer.setState(BEFORE_ATTRIBUTE_NAME_STATE); + return; + case Characters.SOLIDUS: + tokenizer.setState(SELF_CLOSING_START_TAG_STATE); + return; + case Characters.GREATERTHAN_SIGN: + tokenizer.setState(DATA_STATE); + tokenizer.addCurrentAttributeAndEmitToken(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentTagToken(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleSelfClosingStartTagState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { @@ -229,7 +225,30 @@ static void handleRCDataState(Tokenizer tokenizer, ProcessedInputStream processe tokenizer.emitEOF(); // does nothing break; default: + int previousInsertionMode = tokenizer.getTokenHandlerInsertionMode(); tokenizer.emitCharacter(chr); + int currentInsertionMode = tokenizer.getTokenHandlerInsertionMode(); + ResizableCharBuilder textNode = tokenizer.getTokenHandlerInsertCharacterPreviousTextNode(); + if (tokenizer.getState() == RCDATA_STATE && previousInsertionMode == currentInsertionMode && textNode != null) { + int internalChr = processedInputStream.readUntil(textNode, true, true); + switch (internalChr) { + case Characters.AMPERSAND: + 
tokenizer.setState(CHARACTER_REFERENCE_IN_RCDATA_STATE); + return; + case Characters.LESSTHAN_SIGN: + tokenizer.setState(RCDATA_LESS_THAN_SIGN_STATE); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitEOF(); + return; + default: + break; + } + } break; } } @@ -344,24 +363,21 @@ static void handleScriptDataState(Tokenizer tokenizer, ProcessedInputStream proc // optimization: bypass if possible if (tokenizer.getState() == SCRIPT_DATA_STATE && previousInsertionMode == currentInsertionMode && textNode != null) { - for (;;) { - int internalChr = processedInputStream.getNextInputCharacterAndConsume(); - switch (internalChr) { - case Characters.EOF: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.emitEOF(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.LESSTHAN_SIGN: - tokenizer.setState(SCRIPT_DATA_LESS_THAN_SIGN_STATE); - return; - default: - textNode.append((char) internalChr); - break; - } + int internalChr = processedInputStream.readUntil(textNode, false, true); + switch (internalChr) { + case Characters.EOF: + tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); + tokenizer.emitEOF(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.LESSTHAN_SIGN: + tokenizer.setState(SCRIPT_DATA_LESS_THAN_SIGN_STATE); + return; + default: + break; } } break; @@ -800,7 +816,24 @@ static void handlePlainTextState(Tokenizer tokenizer, ProcessedInputStream proce tokenizer.emitEOF(); break; default: + int previousInsertionMode = tokenizer.getTokenHandlerInsertionMode(); tokenizer.emitCharacter(chr); + int currentInsertionMode = tokenizer.getTokenHandlerInsertionMode(); + ResizableCharBuilder textNode = 
tokenizer.getTokenHandlerInsertCharacterPreviousTextNode(); + if (tokenizer.getState() == PLAINTEXT_STATE && previousInsertionMode == currentInsertionMode && textNode != null) { + int internalChr = processedInputStream.readUntil(textNode, false, false); + switch (internalChr) { + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitEOF(); + return; + default: + break; + } + } break; } } @@ -827,25 +860,20 @@ static void handleRawtextState(Tokenizer tokenizer, ProcessedInputStream process int currentInsertionMode = tokenizer.getTokenHandlerInsertionMode(); ResizableCharBuilder textNode = tokenizer.getTokenHandlerInsertCharacterPreviousTextNode(); if (tokenizer.getState() == RAWTEXT_STATE && previousInsertionMode == currentInsertionMode && textNode != null) { - - for (;;) { - int internalChr = processedInputStream.getNextInputCharacterAndConsume(); - switch (internalChr) { - case Characters.LESSTHAN_SIGN: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.setState(RAWTEXT_LESS_THAN_SIGN_STATE); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.emitEOF(); - return; - default: - textNode.append((char) internalChr); - } + int internalChr = processedInputStream.readUntil(textNode, false, true); + switch (internalChr) { + case Characters.LESSTHAN_SIGN: + tokenizer.setState(RAWTEXT_LESS_THAN_SIGN_STATE); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitEOF(); + return; + default: + break; } } break; @@ -1543,25 +1571,23 @@ static void handleCommentStartDashState(Tokenizer tokenizer, ProcessedInputStrea } static void 
handleCommentState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.HYPHEN_MINUS: - tokenizer.setState(COMMENT_END_DASH_STATE); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCommentCharacter(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - tokenizer.emitComment(); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCommentCharacter(chr); - } - } while (true); + int chr = processedInputStream.readUntilComment(tokenizer.commentToken); + switch (chr) { + case Characters.HYPHEN_MINUS: + tokenizer.setState(COMMENT_END_DASH_STATE); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCommentCharacter(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + tokenizer.emitComment(); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleCommentEndDashState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { @@ -1997,29 +2023,26 @@ static void handleDataState(Tokenizer tokenizer, ProcessedInputStream processedI && (currentInsertionMode == TreeConstructor.IM_IN_BODY || currentInsertionMode == TreeConstructor.IM_IN_CELL) && tokenizer.isTokenHandlerInHtmlContent() && textNode != null) { - for (;;) { - int internalChr = processedInputStream.getNextInputCharacterAndConsume(); - switch (internalChr) { - case Characters.EOF: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.emitEOF(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.emitCharacter(internalChr); - return; - case Characters.AMPERSAND: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.setState(CHARACTER_REFERENCE_IN_DATA_STATE); - return; - 
case Characters.LESSTHAN_SIGN: - tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); - tokenizer.setState(TAG_OPEN_STATE); - return; - default: - textNode.append((char) internalChr); - break; - } + int internalChr = processedInputStream.readUntil(textNode, true, true); + switch (internalChr) { + case Characters.EOF: + tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); + tokenizer.emitEOF(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.emitCharacter(internalChr); + return; + case Characters.AMPERSAND: + tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); + tokenizer.setState(CHARACTER_REFERENCE_IN_DATA_STATE); + return; + case Characters.LESSTHAN_SIGN: + tokenizer.resetTokenHandlerInsertCharacterPreviousTextNode(); + tokenizer.setState(TAG_OPEN_STATE); + return; + default: + break; } } @@ -2105,45 +2128,41 @@ static void handleBeforeAttributeNameState(Tokenizer tokenizer, ProcessedInputSt } static void handleAttributeNameState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - // vvv optimization vvv, we try to bypass as much as possible for the case "appendCurrentAttributeName" - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.TAB: - case Characters.LF: - case Characters.FF: - case Characters.SPACE: - tokenizer.setState(AFTER_ATTRIBUTE_NAME_STATE); - return; - case Characters.SOLIDUS: - tokenizer.setState(SELF_CLOSING_START_TAG_STATE); - return; - case Characters.EQUALS_SIGN: - tokenizer.setState(BEFORE_ATTRIBUTE_VALUE_STATE); - return; - case Characters.GREATERTHAN_SIGN: - tokenizer.setState(DATA_STATE); - tokenizer.addCurrentAttributeAndEmitToken(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeName(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.QUOTATION_MARK: - case Characters.APOSTROPHE: - case Characters.LESSTHAN_SIGN: - tokenizer.emitParseError(); - 
tokenizer.appendCurrentAttributeName(chr); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCurrentAttributeName(chr); - break; - } - } while (true); + int chr = processedInputStream.readUntilAttributeName(tokenizer.currentAttributeName); + switch (chr) { + case Characters.TAB: + case Characters.LF: + case Characters.FF: + case Characters.SPACE: + tokenizer.setState(AFTER_ATTRIBUTE_NAME_STATE); + return; + case Characters.SOLIDUS: + tokenizer.setState(SELF_CLOSING_START_TAG_STATE); + return; + case Characters.EQUALS_SIGN: + tokenizer.setState(BEFORE_ATTRIBUTE_VALUE_STATE); + return; + case Characters.GREATERTHAN_SIGN: + tokenizer.setState(DATA_STATE); + tokenizer.addCurrentAttributeAndEmitToken(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeName(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.QUOTATION_MARK: + case Characters.APOSTROPHE: + case Characters.LESSTHAN_SIGN: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeName(chr); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleAfterAttributeNameState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { @@ -2243,100 +2262,93 @@ static void handleBeforeAttributeValueState(Tokenizer tokenizer, ProcessedInputS static void handleAttributeValueDoubleQuotedState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { // vvv optimization vvv, we try to append as much as possible - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.QUOTATION_MARK: - tokenizer.setState(AFTER_ATTRIBUTE_VALUE_QUOTED_STATE); - return; - case Characters.AMPERSAND: - // save current state - 
tokenizer.setPreviousState(ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE); - // - tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); - tokenizer.additionalAllowedCharacter = Characters.QUOTATION_MARK; - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCurrentAttributeValue(chr); - } - } while (true); + + int chr = processedInputStream.readUntilAttributeValue(tokenizer.currentAttributeValue, Characters.QUOTATION_MARK, true); + switch (chr) { + case Characters.QUOTATION_MARK: + tokenizer.setState(AFTER_ATTRIBUTE_VALUE_QUOTED_STATE); + return; + case Characters.AMPERSAND: + // save current state + tokenizer.setPreviousState(ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE); + // + tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); + tokenizer.additionalAllowedCharacter = Characters.QUOTATION_MARK; + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + } } static void handleAttributeValueSingleQuotedState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.APOSTROPHE: - tokenizer.setState(AFTER_ATTRIBUTE_VALUE_QUOTED_STATE); - return; - case Characters.AMPERSAND: - // save current state - tokenizer.setPreviousState(ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE); - // - tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); - tokenizer.additionalAllowedCharacter = Characters.APOSTROPHE; - return; - case Characters.NULL: - tokenizer.emitParseError(); - 
tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCurrentAttributeValue(chr); - } - } while (true); + int chr = processedInputStream.readUntilAttributeValue(tokenizer.currentAttributeValue, Characters.APOSTROPHE, true); + switch (chr) { + case Characters.APOSTROPHE: + tokenizer.setState(AFTER_ATTRIBUTE_VALUE_QUOTED_STATE); + return; + case Characters.AMPERSAND: + // save current state + tokenizer.setPreviousState(ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE); + // + tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); + tokenizer.additionalAllowedCharacter = Characters.APOSTROPHE; + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleAttributeValueUnquotedState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { - do { - int chr = processedInputStream.getNextInputCharacterAndConsume(); - switch (chr) { - case Characters.TAB: - case Characters.LF: - case Characters.FF: - case Characters.SPACE: - tokenizer.setState(BEFORE_ATTRIBUTE_NAME_STATE); - return; - case Characters.AMPERSAND: - tokenizer.setPreviousState(ATTRIBUTE_VALUE_UNQUOTED_STATE); - tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); - tokenizer.additionalAllowedCharacter = Characters.GREATERTHAN_SIGN; - return; - case Characters.GREATERTHAN_SIGN: - tokenizer.setState(DATA_STATE); - tokenizer.addCurrentAttributeAndEmitToken(); - return; - case Characters.NULL: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); - return; - case Characters.QUOTATION_MARK: - case 
Characters.APOSTROPHE: - case Characters.LESSTHAN_SIGN: - case Characters.EQUALS_SIGN: - case Characters.GRAVE_ACCENT: - tokenizer.emitParseError(); - tokenizer.appendCurrentAttributeValue(chr); - return; - case Characters.EOF: - tokenizer.emitParseErrorAndSetState(DATA_STATE); - processedInputStream.reconsume(chr); - return; - default: - tokenizer.appendCurrentAttributeValue(chr); - } - } while (true); + int chr = processedInputStream.readUntilAttributeValueUnquoted(tokenizer.currentAttributeValue); + switch (chr) { + case Characters.TAB: + case Characters.LF: + case Characters.FF: + case Characters.SPACE: + tokenizer.setState(BEFORE_ATTRIBUTE_NAME_STATE); + return; + case Characters.AMPERSAND: + tokenizer.setPreviousState(ATTRIBUTE_VALUE_UNQUOTED_STATE); + tokenizer.setState(CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE); + tokenizer.additionalAllowedCharacter = Characters.GREATERTHAN_SIGN; + return; + case Characters.GREATERTHAN_SIGN: + tokenizer.setState(DATA_STATE); + tokenizer.addCurrentAttributeAndEmitToken(); + return; + case Characters.NULL: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeValue(Characters.REPLACEMENT_CHARACTER); + return; + case Characters.QUOTATION_MARK: + case Characters.APOSTROPHE: + case Characters.LESSTHAN_SIGN: + case Characters.EQUALS_SIGN: + case Characters.GRAVE_ACCENT: + tokenizer.emitParseError(); + tokenizer.appendCurrentAttributeValue(chr); + return; + case Characters.EOF: + tokenizer.emitParseErrorAndSetState(DATA_STATE); + processedInputStream.reconsume(chr); + return; + default: + break; + } } static void handleCharacterReferenceInAttributeValueState(Tokenizer tokenizer, ProcessedInputStream processedInputStream) { diff --git a/src/test/java/ch/digitalfondue/jfiveparse/ProcessedInputStreamWithParseError.java b/src/test/java/ch/digitalfondue/jfiveparse/ProcessedInputStreamWithParseError.java index cd5c8c5..1cd2459 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/ProcessedInputStreamWithParseError.java 
+++ b/src/test/java/ch/digitalfondue/jfiveparse/ProcessedInputStreamWithParseError.java @@ -45,14 +45,6 @@ int consume() { return chr; } - private int getCharAt(int position) { - try { - return input[position]; - } catch (IndexOutOfBoundsException s) { - return -1; - } - } - private int getCurrentInputCharacter() { if (buffer.length() > 0) { return buffer.getCharAt(0);