diff --git a/build.gradle b/build.gradle deleted file mode 100644 index fe3256b..0000000 --- a/build.gradle +++ /dev/null @@ -1,154 +0,0 @@ -group 'net.dankito.readability4j' -version '1.0.8' -def mavenArtifactId = "readability4j" - -buildscript { - ext { - kotlin_version = '1.3.72' - - slf4jVersion = '1.7.25' - - jsoupVersion = '1.11.2' - - jacksonVersion = '2.9.2' - - logbackVersion = '1.2.3' - - diffUtilsVersion = '2.2' - - okHttpVersion = '3.9.1' - } - - repositories { - mavenCentral() - } - dependencies { - classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version" - classpath "io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.10.0" - } -} - - -apply plugin: 'java' -apply plugin: 'kotlin' - -apply plugin: 'maven' -apply plugin: 'signing' - -// So after executing uploadArchives staged repository can be closed and released by executing closeAndReleaseRepository -apply plugin: 'io.codearte.nexus-staging' - - -sourceCompatibility = 1.7 // for Android use compatibility with Java 7 - -repositories { - mavenCentral() -} - -dependencies { - compile "org.jetbrains.kotlin:kotlin-stdlib:$kotlin_version" - - compile "org.slf4j:slf4j-api:$slf4jVersion" - - compile "org.jsoup:jsoup:$jsoupVersion" - - - testCompile "junit:junit:4.12" - - testCompile "com.fasterxml.jackson.module:jackson-module-kotlin:$jacksonVersion" - - testCompile "com.github.wumpz:diffutils:$diffUtilsVersion" - - testCompile "com.squareup.okhttp3:okhttp:$okHttpVersion" - - testCompile "ch.qos.logback:logback-core:$logbackVersion" - testCompile "ch.qos.logback:logback-classic:$logbackVersion" -} - - -compileKotlin { - kotlinOptions.jvmTarget = "1.6" -} - -compileTestKotlin { - kotlinOptions.jvmTarget = "1.6" -} - - - -/* publish to maven central */ - -// set ossrhUsername and ossrhPassword in your gradle.properties (in ~/.gradle/gradle.properties or project's gradle.properties) -def areOssrhPropertiesSet = isPropertySet('ossrhUsername') && isPropertySet('ossrhPassword') - -def 
isPropertySet(propertyName) { - return properties[propertyName] != null && ! (properties[propertyName] as String).isEmpty() -} - -task javadocJar(type: Jar) { - classifier = 'javadoc' - from javadoc -} - -task sourcesJar(type: Jar) { - classifier = 'sources' - from sourceSets.main.allSource -} - -artifacts { - archives javadocJar, sourcesJar -} - -signing { - sign configurations.archives -} - -uploadArchives { - repositories { - mavenDeployer { - beforeDeployment { MavenDeployment deployment -> signing.signPom(deployment) } - - repository url: repositories.mavenLocal().url - - if (areOssrhPropertiesSet) { - repository(url: "https://oss.sonatype.org/service/local/staging/deploy/maven2/") { - authentication(userName: ossrhUsername, password: ossrhPassword) - } - - snapshotRepository(url: "https://oss.sonatype.org/content/repositories/snapshots/") { - authentication(userName: ossrhUsername, password: ossrhPassword) - } - } - - pom.project { - name mavenArtifactId - artifactId = mavenArtifactId - packaging 'jar' - // optionally artifactId can be defined here - description "A Kotlin port of Mozilla‘s Readability. It extracts a website‘s relevant content and removes all clutter from it." 
- url 'https://github.com/dankito/Readability4J' - - scm { - connection 'scm:git:git://github.com/dankito/Readability4J.git' - developerConnection 'scm:git:git@github.com:dankito/Readability4J.git' - url 'https://github.com/dankito/Readability4J' - } - - licenses { - license { - name 'The Apache License, Version 2.0' - url 'http://www.apache.org/licenses/LICENSE-2.0.txt' - } - } - - developers { - developer { - id 'dankito' - name 'Christian Dankl' - email 'maven@dankito.net' - } - } - } - } - } -} \ No newline at end of file diff --git a/build.gradle.kts b/build.gradle.kts new file mode 100644 index 0000000..47e6ca8 --- /dev/null +++ b/build.gradle.kts @@ -0,0 +1,121 @@ +plugins { + java + id("org.jetbrains.kotlin.jvm") version "1.9.10" + id("maven-publish") + id("signing") +} + +group = "net.dankito.readability4j" +version = "1.0.8" + +val mavenArtifactId = "readability4j" + +object versions { + const val kotlin = "1.9.10" + const val slf4j = "2.0.9" + const val jsoup = "1.16.1" + const val jackson = "2.15.2" + const val logback = "1.4.11" + const val diffUtils = "4.15" + const val okHttp = "4.11.0" + const val junit = "4.13.2" +} + +java { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 +} + +repositories { + mavenCentral() +} + +dependencies { + implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8:${versions.kotlin}") + implementation("org.slf4j:slf4j-api:${versions.slf4j}") + implementation("org.jsoup:jsoup:${versions.jsoup}") + + testImplementation("junit:junit:${versions.junit}") + testImplementation("com.fasterxml.jackson.module:jackson-module-kotlin:${versions.jackson}") + testImplementation("io.github.java-diff-utils:java-diff-utils:${versions.diffUtils}") + testImplementation("com.squareup.okhttp3:okhttp:${versions.okHttp}") + testImplementation("ch.qos.logback:logback-core:${versions.logback}") + testImplementation("ch.qos.logback:logback-classic:${versions.logback}") + 
testImplementation("org.jetbrains.kotlin:kotlin-test:${versions.kotlin}") +} + +tasks.withType { + kotlinOptions { + jvmTarget = "1.8" + } +} + +// Tasks for generating additional artifacts +tasks.register("javadocJar") { + archiveClassifier.set("javadoc") + from(tasks.javadoc) +} + +tasks.register("sourcesJar") { + archiveClassifier.set("sources") + from(sourceSets.main.get().allSource) +} + +publishing { + publications { + create("mavenJava") { + from(components["java"]) + + artifact(tasks.named("javadocJar")) + artifact(tasks.named("sourcesJar")) + + pom { + name.set(mavenArtifactId) + description.set("A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.") + url.set("https://github.com/dankito/Readability4J") + + scm { + connection.set("scm:git:git://github.com/dankito/Readability4J.git") + developerConnection.set("scm:git:git@github.com:dankito/Readability4J.git") + url.set("https://github.com/dankito/Readability4J") + } + + licenses { + license { + name.set("The Apache License, Version 2.0") + url.set("http://www.apache.org/licenses/LICENSE-2.0.txt") + } + } + + developers { + developer { + id.set("dankito") + name.set("Christian Dankl") + email.set("maven@dankito.net") + } + } + } + } + } + + repositories { + maven { + name = "OSSRH" + url = uri("https://oss.sonatype.org/service/local/staging/deploy/maven2/") + + credentials { + username = project.findProperty("ossrhUsername") as String? ?: "" + password = project.findProperty("ossrhPassword") as String? ?: "" + } + } + } +} + +signing { + useInMemoryPgpKeys( + project.findProperty("signing.keyId") as String? ?: "", + project.findProperty("signing.secretKeyRingFile") as String? ?: "", + project.findProperty("signing.password") as String? 
?: "" + ) + sign(publishing.publications["mavenJava"]) +} diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index 0c6e54a..249e583 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 7256b96..a20f9a0 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ -#Tue Nov 21 09:38:12 CET 2017 +#Mon Nov 25 13:38:40 CET 2024 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip zipStoreBase=GRADLE_USER_HOME -zipStorePath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-6.2.1-all.zip +zipStorePath=wrapper/dists \ No newline at end of file diff --git a/gradlew b/gradlew index 4453cce..1b6c787 100755 --- a/gradlew +++ b/gradlew @@ -1,78 +1,129 @@ -#!/usr/bin/env sh +#!/bin/sh + +# +# Copyright © 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ############################################################################## -## -## Gradle start up script for UN*X -## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. 
If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# ############################################################################## # Attempt to set APP_HOME + # Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. 
-while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi +app_path=$0 + +# Need this for daisy-chained symlinks. +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null + +APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` +APP_BASE_NAME=${0##*/} # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS="" +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' # Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" +MAX_FD=maximum -warn ( ) { +warn () { echo "$*" -} +} >&2 -die ( ) { +die () { echo echo "$*" echo exit 1 -} +} >&2 # OS specific support (must be 'true' or 'false'). cygwin=false msys=false darwin=false nonstop=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; - NONSTOP* ) - nonstop=true - ;; +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; esac CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + # Determine the Java command to use to start the JVM. if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACMD=$JAVA_HOME/jre/sh/java else - JAVACMD="$JAVA_HOME/bin/java" + JAVACMD=$JAVA_HOME/bin/java fi if [ ! 
-x "$JAVACMD" ] ; then die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME @@ -81,7 +132,7 @@ Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi else - JAVACMD="java" + JAVACMD=java which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the @@ -89,84 +140,95 @@ location of your Java installation." fi # Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? -ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac fi -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
+ +# For Cygwin or MSYS, switch paths to Windows format before running java +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) -# For Cygwin, switch paths to Windows format before running java -if $cygwin ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` - else - eval `echo args$i`="\"$arg\"" + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) fi - i=$((i+1)) + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. 
+ shift # remove old arg + set -- "$@" "$arg" # push replacement arg done - case $i in - (0) set -- ;; - (1) set -- "$args0" ;; - (2) set -- "$args0" "$args1" ;; - (3) set -- "$args0" "$args1" "$args2" ;; - (4) set -- "$args0" "$args1" "$args2" "$args3" ;; - (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac fi -# Escape application args -save ( ) { - for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done - echo " " -} -APP_ARGS=$(save "$@") - -# Collect all arguments for the java command, following the shell quoting and substitution rules -eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" - -# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong -if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then - cd "$(dirname "$0")" -fi +# Collect all arguments for the java command; +# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of +# shell script including quotes and variable substitutions, so put them in +# double quotes to make sure that they get re-expanded; and +# * put everything else in single quotes, so that it's not re-expanded. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. 
+# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' exec "$JAVACMD" "$@" diff --git a/src/main/kotlin/net/dankito/readability4j/processor/ArticleGrabber.kt b/src/main/kotlin/net/dankito/readability4j/processor/ArticleGrabber.kt index ae329f5..76feb13 100644 --- a/src/main/kotlin/net/dankito/readability4j/processor/ArticleGrabber.kt +++ b/src/main/kotlin/net/dankito/readability4j/processor/ArticleGrabber.kt @@ -472,12 +472,12 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v var next = node while(next.parent() != null) { - ancestors.add(next.parent()) + ancestors.add(next.parent()!!) if(++i == maxDepth) { break } - next = next.parent() + next = next.parent()!! } return ancestors @@ -1004,11 +1004,11 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v return false } - if(parent.parent().tagName() == tagNameLowerCase && (filterFn == null || filterFn(parent.parent()))) { + if(parent.parent()!!.tagName() == tagNameLowerCase && (filterFn == null || filterFn(parent.parent()!!))) { return true } - parent = parent.parent() + parent = parent.parent()!! 
depth++ } @@ -1115,7 +1115,7 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v protected open fun getTextDirection(topCandidate: Element, doc: Document) { val ancestors = mutableSetOf(topCandidate.parent(), topCandidate) - ancestors.addAll(getNodeAncestors(topCandidate.parent())) + ancestors.addAll(getNodeAncestors(topCandidate.parent()!!)) ancestors.add(doc.body()) ancestors.add(doc.selectFirst("html")) // needed as dir is often set on html tag diff --git a/src/main/kotlin/net/dankito/readability4j/processor/MetadataParser.kt b/src/main/kotlin/net/dankito/readability4j/processor/MetadataParser.kt index a84e782..20f72f3 100644 --- a/src/main/kotlin/net/dankito/readability4j/processor/MetadataParser.kt +++ b/src/main/kotlin/net/dankito/readability4j/processor/MetadataParser.kt @@ -97,7 +97,7 @@ open class MetadataParser(protected val regEx: RegExUtil = RegExUtil()): Process else if(curTitle.contains(": ")) { // Check if we have an heading containing this exact string, so we // could assume it's the full title. - val match = doc.select("h1, h2").filter { it.wholeText() == curTitle }.size > 0 + val match = doc.select("h1, h2").any { it.wholeText() == curTitle } // If we don't, let's extract the title out of the original title string. if(match == false) { diff --git a/src/main/kotlin/net/dankito/readability4j/processor/Preprocessor.kt b/src/main/kotlin/net/dankito/readability4j/processor/Preprocessor.kt index 660a2d5..ad72947 100644 --- a/src/main/kotlin/net/dankito/readability4j/processor/Preprocessor.kt +++ b/src/main/kotlin/net/dankito/readability4j/processor/Preprocessor.kt @@ -130,7 +130,7 @@ open class Preprocessor(protected val regEx: RegExUtil = RegExUtil()) : Processo // all sibling nodes as children of the

// <p> until we hit another <br>
// chain. if(replaced) { - val p = br.ownerDocument().createElement("p") + val p = br.ownerDocument()!!.createElement("p") br.replaceWith(p) next = p.nextSibling() diff --git a/src/test/kotlin/net/dankito/readability4j/Readability4JTestBase.kt b/src/test/kotlin/net/dankito/readability4j/Readability4JTestBase.kt index 2adece9..76c4158 100644 --- a/src/test/kotlin/net/dankito/readability4j/Readability4JTestBase.kt +++ b/src/test/kotlin/net/dankito/readability4j/Readability4JTestBase.kt @@ -2,10 +2,13 @@ package net.dankito.readability4j import com.fasterxml.jackson.databind.DeserializationFeature import com.fasterxml.jackson.databind.ObjectMapper -import com.github.difflib.DiffUtils +import junit.framework.TestCase.assertEquals import net.dankito.readability4j.model.ArticleMetadata import net.dankito.readability4j.model.PageTestData import net.dankito.readability4j.model.ReadabilityOptions +import org.jsoup.Jsoup +import org.jsoup.nodes.Document +import org.jsoup.safety.Safelist import java.io.BufferedReader import java.io.File import java.io.FileReader @@ -61,19 +64,19 @@ abstract class Readability4JTestBase { val article = underTest.parse() - val expected = getExpectedText(testData) - val actual = getActualText(article, testData) - - assert(actual == expected) { - "Expected:\n${expected}\n\nActual:\n${actual}\n\nDiff:\n${DiffUtils.diff(expected, actual).deltas.joinToString("\n")}" - } + val expected: Document = cleanParseHtml(getExpectedText(testData)) + val actual: Document = cleanParseHtml(getActualText(article, testData)) + assertEquals(expected.html(), actual.html()) testMetadata(testData, article) return article } + private fun cleanParseHtml(text: String?): Document = + Jsoup.parse(Jsoup.clean(text!!, Safelist.relaxed())) + protected open fun createReadability4J(url: String, testData: PageTestData): Readability4J { // Provide one class name to preserve, which we know appears in a few // of the test documents. 
diff --git a/src/test/kotlin/net/dankito/readability4j/util/TestDataGenerator.kt b/src/test/kotlin/net/dankito/readability4j/util/TestDataGenerator.kt index 1ae9b32..67beb28 100644 --- a/src/test/kotlin/net/dankito/readability4j/util/TestDataGenerator.kt +++ b/src/test/kotlin/net/dankito/readability4j/util/TestDataGenerator.kt @@ -67,7 +67,7 @@ class TestDataGenerator : TestDataGeneratorBase() { val response = executeRequest(request, DefaultCountRetries) - return response.body()?.string() ?: "" + return response.body?.string() ?: "" } catch (e: Exception) { log.error("Could not retrieve response from url $url", e) throw e