diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..3b01cf1 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,14 @@ +# Editorconfig on (https://editorconfig.org/) + +root = true + +[*] +end_of_line = lf +charset = utf-8 +insert_final_newline = true + +[*.java] +indent_size = 4 + +[*.kt] +indent_size = 4 \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..d536530 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "src/test/resources/readability"] + path = src/test/resources/readability + url = https://github.com/mozilla/readability/ diff --git a/README.md b/README.md index b5cf9a6..bc03f2a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Readability4J -[![Maven Central](https://maven-badges.herokuapp.com/maven-central/net.dankito.readability4j/readability4j/badge.svg)](https://maven-badges.herokuapp.com/maven-central/net.dankito.readability4j/readability4j) +[![JitPack](https://jitpack.io/v/NotDroidUser/Readability4J.svg)](https://jitpack.io/#NotDroidUser/Readability4J) Readability4J is a Kotlin port of Mozilla's Readability.js, which is used for Firefox's reader view: https://github.com/mozilla/readability. @@ -7,32 +7,37 @@ It tries to detect the relevant content of a website and removes all clutter fro The extracted text then can be used for indexing web pages, to provide the user a pleasant reading experience and similar. -As it‘s compatible with Mozilla‘s Readability.js it produces exact the same output as you would see in Firefox‘s Reader View (just some white spaces differ due to Jsoup‘s different formatting, but you can‘t see them anyway). +As it‘s compatible with Mozilla‘s Readability.js it produces almost exactly the same output as you would see in Firefox‘s Reader View (a few details differ because Jsoup does not behave exactly the same in some cases, but these are mostly things you can‘t see anyway). 
## Setup -Gradle: -``` -dependencies { - compile 'net.dankito.readability4j:readability4j:1.0.8' -} -``` +Step 1. Add the JitPack repository to your root settings.gradle at the end of repositories: -Maven: -``` - - net.dankito.readability4j - readability4j - 1.0.8 - +```groovy + dependencyResolutionManagement { + repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) + repositories { + mavenCentral() + maven { url 'https://jitpack.io' } + } + } ``` +Step 2. Add the dependency + +```groovy + dependencies { + implementation 'com.github.NotDroidUser:Readability4J:2.0.0-jitpack-beta' + } +``` ## Usage +From Java: + ```java -String url = ...; -String html = ...; +String url = "some-page.com"; +String html = "Some Bloated Article html source"; Readability4J readability4J = new Readability4J(url, html); // url is just needed to resolve relative urls Article article = readability4J.parse(); @@ -46,38 +51,80 @@ String title = article.getTitle(); String byline = article.getByline(); String excerpt = article.getExcerpt(); ``` +From Kotlin: -## Readability4J and Readability4JExtended +```kotlin -With Readability4J class I wanted to stick close to Mozilla's Readability to keep compatibility. +val url = "somepage.com" +val html = "Some Bloated Article html source" -But during development I found some handy features not supported by Readability, e. g. copying url from data-src -attribute to <img src="" /> to display lazy loading images, using <head><base>'s href value for resolving -relative urls and a -better -detection of -which -images to keep in output. +val readability4J = Readability4J(url, html) // url is just needed to resolve relative urls +val article = readability4J.parse() -These features I implemented in Readability4JExtended. +// returns extracted content in a
element +val extractedContentHtml = article.content +// to get content wrapped in tags and encoding set to UTF-8, see chapter 'Output encoding' +val extractedContentHtmlWithUtf8Encoding = article.contentWithUtf8Encoding +val extractedContentPlainText = article.textContent +val title = article.title +val byline = article.byline +val excerpt = article.excerpt -If you want to use it, simply instantiate with (the rest of the code stays the same): +``` -
-Readability4J readability4J = new Readability4JExtended(url, html);
-Article article = readability4J.parse();
-
+# Why can't I use Readability4JExtended now? + + +As the Readability.js code changed a lot since the last ported commit (2018-2025), I first updated the Readability4J code base to make the updating process less stressful. You can still get similar behavior with classes like: + +On Java: + +```java +String url = "some-specific-page.com"; +String html = "Some Bloated Article html source that needs extra steps"; + +Readability4J readability4J = new Readability4J(url, html); +ArticleGrabber extended = new ArticleGrabber(readability4J.getOptions(),new BaseRegexUtilExtended()); +readability4J.setArticleGrabber(extended); +``` + +On Kotlin: + +```kotlin +val url = "some-specific-page.com" +val html = "Some Bloated Article html source that needs extra steps" + +val readability4J = Readability4J(url, html) +readability4J.articleGrabber = ArticleGrabber(readability4J.options,BaseRegexUtilExtended()) +``` + +Yet some features of the original Readability4JExtended, like data-src, are already implemented in the original one (srcset regex for example) + + ## Output encoding -As users noted (see Issue [#1](https://github.com/dankito/Readability4J/issues/1) and [#2](https://github.com/dankito/Readability4J/issues/2)) -by default no encoding is applied to Readability4J's output resulting in incorrect display of non-ASCII characters. +As users noted (see Issue [#1](https://github.com/dankito/Readability4J/issues/1) and [#2](https://github.com/dankito/Readability4J/issues/2)) by default no encoding is applied to Readability4J's output resulting in incorrect display of non-ASCII characters. -The reason is like Readability.js Readability4J returns its output in a <div> element, and the only way to set the -encoding in HTML is in a <head><meta charset=""> tag. +The reason is like Readability.js Readability4J returns its output in a `
` element, and the only way to set the encoding in HTML is in a ` ` tag. -So I added these convenience methods to Article class +So I added these convenience methods to Article class: +On Java: ```java String contentHtmlWithUtf8Encoding = article.getContentWithUtf8Encoding(); // or (tries to apply site's charset, if set, or if not uses UTF-8 as fallback @@ -86,12 +133,22 @@ String contentWithDocumentsCharsetOrUtf8 = article.getContentWithDocumentsCharse String contentHtmlWithCustomEncoding = article.getContentWithEncoding("ISO-8859-1"); ``` -which wrap the content in +On Kotlin: + +```kotlin +var contentHtmlWithUtf8Encoding = article.contentWithUtf8Encoding +// or (tries to apply site's charset, if set, or if not uses UTF-8 as fallback +var contentWithDocumentsCharsetOrUtf8 = article.contentWithDocumentsCharsetOrUtf8 +// or +var contentHtmlWithCustomEncoding = article.getContentWithEncoding("ISO-8859-1") +``` + +Which wrap the content in: ``` - + @@ -101,16 +158,16 @@ which wrap the content in ## Compatibility with Mozilla‘s Readability.js -As mentioned before, this is almost an exact copy of Mozilla's Readability.js. But since I didn't find the original code very readable itself, I extracted some parts from the 2000 lines of code into a new classes: +As mentioned before, this is almost an exact copy of Mozilla's Readability.js. But since the code in only one file can be almost unreadable, I extracted some parts from the 2000+ lines of code into a new classes: - + - - + + @@ -121,19 +178,20 @@ As mentioned before, this is almost an exact copy of Mozilla's Readability.js. B - - + +
Readability.js function - Readability4J location + Readability.js functionReadability4J location
_removeScripts() and _prepDocument()Preprocessor.prepareDocument()_unwrapNoscriptImages(), _removeScripts() and _prepDocument()Preprocessor.unwrapNoscriptImages(), Preprocessor.removeScripts() and Preprocessor.prepDocument()
_grabArticle()Postprocessor.postProcessContent()
_getArticleMetadata()MetadataParser.getArticleMetadata()_getJSONLD(),_getArticleMetadata()MetadataParser.getJSONLD(), MetadataParser.getArticleMetadata()
+I added some log functions to Util.kt so that nodes are logged the same way as in JavaScript, which makes comparing them in test cases easier. I also rolled Jackson back to the latest version compatible with Android API 19-25. Overview of which Mozilla‘s Readability.js commit a Readability4J version matches: - + +
Version - Commit - Date + VersionCommitDate
1.0834672e 02/27/18
2.0.0-betaalmost all test from [v0.6.0](https://github.com/mozilla/readability/commit/04fd32f72b448c12b02ba6c40928b67e510bac49) works13/10/25
2.1.0-rconly 4 failing test (with minor differences) [d7949dc4](https://github.com/mozilla/readability/commit/d7949dc4) works12/1/26
+## Testing + +I added readability.js as a git submodule so the tests stay up to date with upstream. I don't take their expected results for granted: the tests call readability.js itself inside HtmlUnit, after some regex-based changes to the script, both syntactic ([see rhino compat](https://mozilla.github.io/rhino/compat/engines.html#ES2015-syntax-spread-syntax-for-iterable-objects)) and non-syntactic, so that it can run as a function rather than a class + ## Extensibility -I tried to create the library as extensible as possible. All above mentioned classes can be overwritten and passed to Readability4J's constructor. +I tried to keep the library as extensible as possible. All above mentioned classes can be overwritten and assigned to the corresponding Readability4J property. ## Logging @@ -159,7 +231,7 @@ So you can use any logger that supports slf4j, like Logback and log4j, to config # License - Copyright 2017 dankito + Copyright 2017 dankito, 2025 NotDroidUser Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/build.gradle b/build.gradle deleted file mode 100644 index fe3256b..0000000 --- a/build.gradle +++ /dev/null @@ -1,154 +0,0 @@ -group 'net.dankito.readability4j' -version '1.0.8' -def mavenArtifactId = "readability4j" - -buildscript { - ext { - kotlin_version = '1.3.72' - - slf4jVersion = '1.7.25' - - jsoupVersion = '1.11.2' - - jacksonVersion = '2.9.2' - - logbackVersion = '1.2.3' - - diffUtilsVersion = '2.2' - - okHttpVersion = '3.9.1' - } - - repositories { - mavenCentral() - } - dependencies { - classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version" - classpath "io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.10.0" - } -} - - -apply plugin: 'java' -apply plugin: 'kotlin' - -apply plugin: 'maven' -apply plugin: 'signing' - -// So after executing uploadArchives staged repository can be closed and released by executing closeAndReleaseRepository -apply plugin: 'io.codearte.nexus-staging' - - -sourceCompatibility = 1.7 // for Android use compatibility with Java 7 - -repositories { - mavenCentral() -} - -dependencies { - compile "org.jetbrains.kotlin:kotlin-stdlib:$kotlin_version" - - compile "org.slf4j:slf4j-api:$slf4jVersion" - - compile "org.jsoup:jsoup:$jsoupVersion" - - - testCompile "junit:junit:4.12" - - testCompile "com.fasterxml.jackson.module:jackson-module-kotlin:$jacksonVersion" - - testCompile "com.github.wumpz:diffutils:$diffUtilsVersion" - - testCompile "com.squareup.okhttp3:okhttp:$okHttpVersion" - - testCompile "ch.qos.logback:logback-core:$logbackVersion" - testCompile "ch.qos.logback:logback-classic:$logbackVersion" -} - - -compileKotlin { - kotlinOptions.jvmTarget = "1.6" -} - -compileTestKotlin { - kotlinOptions.jvmTarget = "1.6" -} - - - -/* publish to maven central */ - -// set ossrhUsername and ossrhPassword in your gradle.properties (in ~/.gradle/gradle.properties or project's gradle.properties) -def areOssrhPropertiesSet = isPropertySet('ossrhUsername') && isPropertySet('ossrhPassword') - -def 
isPropertySet(propertyName) { - return properties[propertyName] != null && ! (properties[propertyName] as String).isEmpty() -} - -task javadocJar(type: Jar) { - classifier = 'javadoc' - from javadoc -} - -task sourcesJar(type: Jar) { - classifier = 'sources' - from sourceSets.main.allSource -} - -artifacts { - archives javadocJar, sourcesJar -} - -signing { - sign configurations.archives -} - -uploadArchives { - repositories { - mavenDeployer { - beforeDeployment { MavenDeployment deployment -> signing.signPom(deployment) } - - repository url: repositories.mavenLocal().url - - if (areOssrhPropertiesSet) { - repository(url: "https://oss.sonatype.org/service/local/staging/deploy/maven2/") { - authentication(userName: ossrhUsername, password: ossrhPassword) - } - - snapshotRepository(url: "https://oss.sonatype.org/content/repositories/snapshots/") { - authentication(userName: ossrhUsername, password: ossrhPassword) - } - } - - pom.project { - name mavenArtifactId - artifactId = mavenArtifactId - packaging 'jar' - // optionally artifactId can be defined here - description "A Kotlin port of Mozilla‘s Readability. It extracts a website‘s relevant content and removes all clutter from it." 
- url 'https://github.com/dankito/Readability4J' - - scm { - connection 'scm:git:git://github.com/dankito/Readability4J.git' - developerConnection 'scm:git:git@github.com:dankito/Readability4J.git' - url 'https://github.com/dankito/Readability4J' - } - - licenses { - license { - name 'The Apache License, Version 2.0' - url 'http://www.apache.org/licenses/LICENSE-2.0.txt' - } - } - - developers { - developer { - id 'dankito' - name 'Christian Dankl' - email 'maven@dankito.net' - } - } - } - } - } -} \ No newline at end of file diff --git a/build.gradle.kts b/build.gradle.kts new file mode 100644 index 0000000..43c3f08 --- /dev/null +++ b/build.gradle.kts @@ -0,0 +1,101 @@ +import org.jetbrains.kotlin.gradle.dsl.JvmTarget +import org.jetbrains.kotlin.gradle.dsl.KotlinJvmCompilerOptions + +plugins { + java + signing + `maven-publish` + alias(libs.plugins.kotlin.jvm) +} + +group = "net.dankito.readability4j" +version = "2.1.0-rc1" + +val mavenArtifactId = "readability4j" + + +java { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 +} + +repositories { + mavenCentral() +} + +dependencies { + implementation(libs.kotlin.stdlib) + implementation(libs.jsoup) + implementation(libs.slf4j.api) + implementation(libs.jackson.kotlin) //for LD-Json + + testImplementation(libs.junit) + testImplementation(libs.htmlunit) + testImplementation(libs.jackson.kotlin) + testImplementation(libs.java.diff.utils) + testImplementation(libs.okHttp3) + testImplementation(libs.logback.core) + testImplementation(libs.logback.classic) + testImplementation(libs.kotlin.test) +} + +tasks.withType> { + compilerOptions { + jvmTarget.set(JvmTarget.JVM_1_8) + } +} + +// Tasks for generating additional artifacts +tasks.register("javadocJar") { + archiveClassifier.set("javadoc") + from(tasks.javadoc) +} + +tasks.register("sourcesJar") { + archiveClassifier.set("sources") + from(sourceSets.main.get().allSource) +} + +publishing { + publications { + 
create("mavenJava") { + from(components["java"]) + + artifact(tasks.named("javadocJar")) + artifact(tasks.named("sourcesJar")) + + pom { + name.set(mavenArtifactId) + description.set("A Kotlin port of Mozilla's Readability. It extracts a website's relevant content and removes all clutter from it.") + url.set("https://github.com/dankito/Readability4J") + + scm { + connection.set("scm:git:git://github.com/dankito/Readability4J.git") + developerConnection.set("scm:git:git@github.com:dankito/Readability4J.git") + url.set("https://github.com/dankito/Readability4J") + } + + licenses { + license { + name.set("The Apache License, Version 2.0") + url.set("http://www.apache.org/licenses/LICENSE-2.0.txt") + } + } + + developers { + developer { + id.set("dankito") + name.set("Christian Dankl") + email.set("maven@dankito.net") + } + developer { + id.set("NotDroidUser") + name.set("Ruben David") + email.set("r1d1p1j1@gmail.com") + } + } + } + } + } + +} diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml new file mode 100644 index 0000000..c6574eb --- /dev/null +++ b/gradle/libs.versions.toml @@ -0,0 +1,27 @@ +[versions] +slf4j = "2.0.17" +jsoup = "1.19.1" +kotlin = "2.0.0" +jackson = "2.13.5" +logback = "1.5.18" +diffUtils = "4.15" +okHttp = "4.12.0" +junitVer = "4.13.2" +htmlunitVer = "4.11.1" + +[libraries] +kotlin-stdlib = { group="org.jetbrains.kotlin", name="kotlin-stdlib-jdk8", version.ref = "kotlin" } +jsoup = { group="org.jsoup", name="jsoup", version.ref = "jsoup" } +slf4j-api= { group="org.slf4j", name="slf4j-api", version.ref="slf4j" } +junit= { group="junit", name="junit", version.ref="junitVer" } +htmlunit= { group="org.htmlunit", name="htmlunit", version.ref="htmlunitVer" } +jackson-kotlin= { group="com.fasterxml.jackson.module", name="jackson-module-kotlin", version.ref="jackson" } +java-diff-utils= { group="io.github.java-diff-utils", name="java-diff-utils", version.ref="diffUtils" } +okHttp3= { group="com.squareup.okhttp3", name="okhttp", 
version.ref="okHttp" } +logback-core= { group="ch.qos.logback", name="logback-core", version.ref="logback" } +logback-classic= { group="ch.qos.logback", name="logback-classic", version.ref="logback" } +kotlin-test= { group="org.jetbrains.kotlin", name="kotlin-test", version.ref="kotlin" } + +[plugins] +kotlin-jvm = { id = "org.jetbrains.kotlin.jvm", version.ref = "kotlin" } + diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index 0c6e54a..249e583 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 7256b96..a20f9a0 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ -#Tue Nov 21 09:38:12 CET 2017 +#Mon Nov 25 13:38:40 CET 2024 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip zipStoreBase=GRADLE_USER_HOME -zipStorePath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-6.2.1-all.zip +zipStorePath=wrapper/dists \ No newline at end of file diff --git a/gradlew b/gradlew index 4453cce..1b6c787 100755 --- a/gradlew +++ b/gradlew @@ -1,78 +1,129 @@ -#!/usr/bin/env sh +#!/bin/sh + +# +# Copyright © 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ############################################################################## -## -## Gradle start up script for UN*X -## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. 
+# ############################################################################## # Attempt to set APP_HOME + # Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. -while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi +app_path=$0 + +# Need this for daisy-chained symlinks. +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null + +APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` +APP_BASE_NAME=${0##*/} # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS="" +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' # Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" +MAX_FD=maximum -warn ( ) { +warn () { echo "$*" -} +} >&2 -die ( ) { +die () { echo echo "$*" echo exit 1 -} +} >&2 # OS specific support (must be 'true' or 'false'). cygwin=false msys=false darwin=false nonstop=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; - NONSTOP* ) - nonstop=true - ;; +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; esac CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + # Determine the Java command to use to start the JVM. 
if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACMD=$JAVA_HOME/jre/sh/java else - JAVACMD="$JAVA_HOME/bin/java" + JAVACMD=$JAVA_HOME/bin/java fi if [ ! -x "$JAVACMD" ] ; then die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME @@ -81,7 +132,7 @@ Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi else - JAVACMD="java" + JAVACMD=java which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the @@ -89,84 +140,95 @@ location of your Java installation." fi # Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? -ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi +if ! "$cygwin" && ! "$darwin" && ! 
"$nonstop" ; then + case $MAX_FD in #( + max*) + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac fi -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. + +# For Cygwin or MSYS, switch paths to Windows format before running java +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) -# For Cygwin, switch paths to Windows format before running java -if $cygwin ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed 
"$arg"` - else - eval `echo args$i`="\"$arg\"" + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) fi - i=$((i+1)) + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg done - case $i in - (0) set -- ;; - (1) set -- "$args0" ;; - (2) set -- "$args0" "$args1" ;; - (3) set -- "$args0" "$args1" "$args2" ;; - (4) set -- "$args0" "$args1" "$args2" "$args3" ;; - (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac fi -# Escape application args -save ( ) { - for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done - echo " " -} -APP_ARGS=$(save "$@") - -# Collect all arguments for the java command, following the shell quoting and substitution rules -eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" - -# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong -if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then - cd "$(dirname "$0")" -fi +# Collect all arguments for the java command; 
+# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of +# shell script including quotes and variable substitutions, so put them in +# double quotes to make sure that they get re-expanded; and +# * put everything else in single quotes, so that it's not re-expanded. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. 
+# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' exec "$JAVACMD" "$@" diff --git a/settings.gradle b/settings.gradle index abe46f2..d6dc94e 100644 --- a/settings.gradle +++ b/settings.gradle @@ -1,2 +1,24 @@ +pluginManagement { + repositories { + google { + content { + includeGroupByRegex("com\\.android.*") + includeGroupByRegex("com\\.google.*") + includeGroupByRegex("androidx.*") + } + } + mavenCentral() + gradlePluginPortal() + } +} +dependencyResolutionManagement { + repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) + repositories { + google() + mavenCentral() + maven { url 'https://jitpack.io' } + } +} + rootProject.name = 'Readability4J' diff --git a/src/main/kotlin/net/dankito/readability4j/Article.kt b/src/main/kotlin/net/dankito/readability4j/Article.kt index a8d4a26..e91ee79 100644 --- a/src/main/kotlin/net/dankito/readability4j/Article.kt +++ b/src/main/kotlin/net/dankito/readability4j/Article.kt @@ -7,27 +7,45 @@ open class Article( /** * Original uri object that was passed to constructor + * that has no usage and also is in the same context that the Readability4J is called */ - val uri: String - ) { + constructor(uri:String):this(){ + this.uri=uri + } + @Deprecated("This has no sense as you has the url in the context you call Readability4J", + level = DeprecationLevel.WARNING) + var uri: String="" /** * Article title */ var title: String? = null + /** + * The actual html object of the article + * */ var articleContent: Element? = null + /** + * Content lang (from html tag) default to empty + */ + var lang:String? = null + /** * HTML string of processed article content in a <div> element. * - * Therefore no encoding is applied, see [contentWithUtf8Encoding] or issue - * [https://github.com/dankito/Readability4J/issues/1]. 
+ * Therefore no encoding is applied as intended in the js library, + * @see contentWithUtf8Encoding + * @see The github issue. */ val content: String? - get() = articleContent?.html() // TODO: but this removes paging information (pages in top node
) + get() = articleContent?.outerHtml() + + var siteName:String? = null + + var publishedTime:String? = null /** * [content] returns a <div> element. * @@ -38,26 +56,29 @@ open class Article( * So this method wraps [content] in <html><head><meta charset="utf-8"/></head><body><!-- * content--></body></html> so that UTF-8 encoding gets applied. * - * See [https://github.com/dankito/Readability4J/issues/1] for more info. + * @see The issue for more info. */ val contentWithUtf8Encoding: String? get() = getContentWithEncoding("utf-8") /** * Returns the content wrapped in an element with charset set to document's charset. Or if that is not set in UTF-8. - * See [contentWithUtf8Encoding] for more details. + * @see [contentWithUtf8Encoding] for more details. */ val contentWithDocumentsCharsetOrUtf8: String? get() = getContentWithEncoding(charset ?: "utf-8") + /** + * Content text (only text) + */ val textContent: String? get() = articleContent?.text() /** * Length of article, in characters */ - var length: Int = -1 - get() = textContent?.length ?: -1 + val length: Int + get() = textContent?.length ?: -1 /** * Article description, or short excerpt from content @@ -77,6 +98,7 @@ open class Article( /** * Article's charset */ + @Deprecated("Right now all sites uses utf-8", level = DeprecationLevel.WARNING) var charset: String? 
= null @@ -96,9 +118,8 @@ open class Article( content?.let { content -> return "\n \n \n \n \n " + "$content\n \n" - } - - return null + }?: return null } -} \ No newline at end of file + +} diff --git a/src/main/kotlin/net/dankito/readability4j/Readability4J.kt b/src/main/kotlin/net/dankito/readability4j/Readability4J.kt index 79b0e84..65b95b6 100644 --- a/src/main/kotlin/net/dankito/readability4j/Readability4J.kt +++ b/src/main/kotlin/net/dankito/readability4j/Readability4J.kt @@ -6,69 +6,40 @@ import net.dankito.readability4j.processor.ArticleGrabber import net.dankito.readability4j.processor.MetadataParser import net.dankito.readability4j.processor.Postprocessor import net.dankito.readability4j.processor.Preprocessor -import net.dankito.readability4j.util.RegExUtil import org.jsoup.Jsoup import org.jsoup.nodes.Document import org.jsoup.nodes.Element +import org.slf4j.Logger import org.slf4j.LoggerFactory - - -open class Readability4J { - - companion object { - private val log = LoggerFactory.getLogger(Readability4J::class.java) - } - - - protected val uri: String - - protected val document: Document - - protected val options: ReadabilityOptions - - protected val regEx: RegExUtil - - protected val preprocessor: Preprocessor - - protected val metadataParser: MetadataParser - - protected val articleGrabber: ArticleGrabber - - protected val postprocessor: Postprocessor - - - // TODO: add IDependencyResolver interface and @JvmOverloads - - // for Java interoperability - /** - * Calls Readability(String, String, ReadabilityOptions) with default ReadabilityOptions - */ - constructor(uri: String, html: String) : this(uri, html, ReadabilityOptions()) - - constructor(uri: String, html: String, options: ReadabilityOptions = ReadabilityOptions(), regExUtil: RegExUtil = RegExUtil(), - preprocessor: Preprocessor = Preprocessor(regExUtil), metadataParser: MetadataParser = MetadataParser(regExUtil), - articleGrabber: ArticleGrabber = ArticleGrabber(options, regExUtil), 
postprocessor: Postprocessor = Postprocessor()) - : this(uri, Jsoup.parse(html, uri), options, regExUtil, preprocessor, metadataParser, articleGrabber, postprocessor) - - // for Java interoperability - /** - * Calls Readability(String, Document, ReadabilityOptions) with default ReadabilityOptions - */ - constructor(uri: String, document: Document) : this(uri, document, ReadabilityOptions()) - - constructor(uri: String, document: Document, options: ReadabilityOptions = ReadabilityOptions(), regExUtil: RegExUtil = RegExUtil(), - preprocessor: Preprocessor = Preprocessor(regExUtil), metadataParser: MetadataParser = MetadataParser(regExUtil), - articleGrabber: ArticleGrabber = ArticleGrabber(options, regExUtil), postprocessor: Postprocessor = Postprocessor()) { - this.uri = uri - this.document = document - this.options = options - - this.regEx = regExUtil - this.preprocessor = preprocessor - this.metadataParser = metadataParser - this.articleGrabber = articleGrabber - this.postprocessor = postprocessor - } +import kotlin.system.measureNanoTime + + +open class Readability4J +/** + * Calls Readability4J with default params if no options provided, + * this constructor uses the uri for the postprocessing and Jsoup, + * as differ of js version keeps the url as you cant call in a html + * text documentUri as they call in the Postprocessor to process the URIs + * to make them absolute + * + * @param uri The uri (for Jsoup and for the Postprocessor) also can be empty string if + * you want to process manually after the article is served and don't waste that time + * @param html The page as string (this for Jsoup) + * @param options optional, if you don't provide it, will be all default options + * @see net.dankito.readability4j.model.ReadabilityOptions + * @see net.dankito.readability4j.processor.Postprocessor + */ +@Throws(ExceptionInInitializerError::class) +@JvmOverloads constructor( + val uri:String, + val html:String, + val options: ReadabilityOptions = 
ReadabilityOptions(), +) { + private val log: Logger = LoggerFactory.getLogger(Readability4J::class.java) + var metadataParser: MetadataParser = MetadataParser() + var preprocessor: Preprocessor = Preprocessor() + var articleGrabber: ArticleGrabber = ArticleGrabber(options) + var postprocessor: Postprocessor = Postprocessor() /** @@ -82,31 +53,68 @@ open class Readability4J { * 4. Replace the current DOM tree with the new one. * 5. Read peacefully. * + * @return The actual article if the article exists in the html, + * else an empty Article with null content + * @throws RuntimeException if too many elements to parse (As you put in options) + * @see net.dankito.readability4j.Article + * */ + @Throws(RuntimeException::class) open fun parse(): Article { + + val document: Document + + log.info("Time parsing Document:{}",measureNanoTime { + document= Jsoup.parse(html,uri) + }) + // Avoid parsing too large documents, as per configuration option if (options.maxElemsToParse > 0) { - val numTags = document.getElementsByTag("*").size + val numTags = document.count() if(numTags > options.maxElemsToParse) { - throw Exception("Aborting parsing document; $numTags elements found, but ReadabilityOption.maxElemsToParse is set to ${options.maxElemsToParse}") + throw RuntimeException("Aborting parsing document; $numTags elements found, but ReadabilityOption.maxElemsToParse is set to ${options.maxElemsToParse}") } } - val article = Article(uri) + log.info("Time unwraping noscripts :{}",measureNanoTime { + preprocessor.unwrapNoscriptImages(document) + }) - preprocessor.prepareDocument(document) + var jsonLDMetadata:ArticleMetadata?=null + if (!options.disableJSONLD){ + log.info("Time Processing Json-LD :{}",measureNanoTime { + jsonLDMetadata=metadataParser.getJSONLD(document) + }) + } - val metadata = metadataParser.getArticleMetadata(document) + // this one also remove the scripts + log.info("Time Pre-Processing Document :{}",measureNanoTime { + preprocessor.prepareDocument(document) 
+ }) + + val metadata: ArticleMetadata + log.info("Time Parsing Metadata :{}",measureNanoTime { + metadata = metadataParser.getArticleMetadata(document,jsonLDMetadata) + }) + + val articleContent: Element? + log.info("Time Grabbing Article :{}",measureNanoTime { + articleContent = articleGrabber.grabArticle(document, metadata) + }) + + val article = Article() + if (articleContent==null){ + return article.also { setArticleMetadata(article,metadata,null) } + // send a empty result, as nothing are found here + } - val articleContent = articleGrabber.grabArticle(document, metadata) log.debug("Grabbed: {}", articleContent) - articleContent?.let { // TODO: or return null if grabbing didn't work? - postprocessor.postProcessContent(document, articleContent, uri, options.additionalClassesToPreserve) - - article.articleContent = articleContent - } - + log.info("Time Post-Processing Document :{}",measureNanoTime { + //this is removing things af + postprocessor.postProcessContent( articleContent, document.baseUri(), uri, options ) + }) + article.articleContent = articleContent setArticleMetadata(article, metadata, articleContent) return article @@ -116,17 +124,21 @@ open class Readability4J { // If we haven't found an excerpt in the article's metadata, use the article's // first paragraph as the excerpt. This is used for displaying a preview of // the article's content. 
+ if(metadata.excerpt.isNullOrBlank()) { articleContent?.getElementsByTag("p")?.first()?.let { firstParagraph -> - metadata.excerpt = firstParagraph.text().trim() + metadata.excerpt = firstParagraph.wholeText().trim() } } article.title = metadata.title article.byline = if(metadata.byline.isNullOrBlank()) articleGrabber.articleByline else metadata.byline + articleGrabber.articleLang?.let { article.lang= it} article.dir = articleGrabber.articleDir article.excerpt = metadata.excerpt - article.charset = metadata.charset + article.siteName = metadata.siteName + article.publishedTime = metadata.publishedTime + //article.charset = metadata.charset // this doesn't exist anymore in js } -} \ No newline at end of file +} diff --git a/src/main/kotlin/net/dankito/readability4j/extended/Readability4JExtended.kt b/src/main/kotlin/net/dankito/readability4j/extended/Readability4JExtended.kt index 8d1c804..4d0057f 100644 --- a/src/main/kotlin/net/dankito/readability4j/extended/Readability4JExtended.kt +++ b/src/main/kotlin/net/dankito/readability4j/extended/Readability4JExtended.kt @@ -3,12 +3,10 @@ package net.dankito.readability4j.extended import net.dankito.readability4j.Readability4J import net.dankito.readability4j.extended.processor.ArticleGrabberExtended import net.dankito.readability4j.extended.processor.PostprocessorExtended -import net.dankito.readability4j.extended.util.RegExUtilExtended +import net.dankito.readability4j.extended.util.BaseRegexUtilExtended import net.dankito.readability4j.model.ReadabilityOptions import net.dankito.readability4j.processor.MetadataParser import net.dankito.readability4j.processor.Preprocessor -import org.jsoup.Jsoup -import org.jsoup.nodes.Document open class Readability4JExtended : Readability4J { @@ -17,22 +15,21 @@ open class Readability4JExtended : Readability4J { /** * Calls Readability(String, String, ReadabilityOptions) with default ReadabilityOptions */ - constructor(uri: String, html: String) : this(uri, html, 
ReadabilityOptions()) - constructor(uri: String, html: String, options: ReadabilityOptions = ReadabilityOptions(), regExUtil: RegExUtilExtended = RegExUtilExtended(), - preprocessor: Preprocessor = Preprocessor(regExUtil), metadataParser: MetadataParser = MetadataParser(regExUtil), - articleGrabber: ArticleGrabberExtended = ArticleGrabberExtended(options, regExUtil), postprocessor: PostprocessorExtended = PostprocessorExtended()) - : this(uri, Jsoup.parse(html, uri), options, regExUtil, preprocessor, metadataParser, articleGrabber, postprocessor) - - // for Java interoperability - /** - * Calls Readability(String, Document, ReadabilityOptions) with default ReadabilityOptions - */ - constructor(uri: String, document: Document) : this(uri, document, ReadabilityOptions()) - - constructor(uri: String, document: Document, options: ReadabilityOptions = ReadabilityOptions(), regExUtil: RegExUtilExtended = RegExUtilExtended(), - preprocessor: Preprocessor = Preprocessor(regExUtil), metadataParser: MetadataParser = MetadataParser(regExUtil), - articleGrabber: ArticleGrabberExtended = ArticleGrabberExtended(options, regExUtil), postprocessor: PostprocessorExtended = PostprocessorExtended()) - : super(uri, document, options, regExUtil, preprocessor, metadataParser, articleGrabber, postprocessor) - -} \ No newline at end of file + @JvmOverloads + constructor(uri: String, + html: String, + options: ReadabilityOptions = ReadabilityOptions(), + regExUtil: BaseRegexUtilExtended = BaseRegexUtilExtended(), + preprocessor: Preprocessor = Preprocessor(regExUtil), + metadataParser: MetadataParser = MetadataParser(regExUtil), + articleGrabber: ArticleGrabberExtended = ArticleGrabberExtended(options, regExUtil), + postprocessor: PostprocessorExtended = PostprocessorExtended()) + : super(uri,html,options){ + this.articleGrabber=articleGrabber + this.preprocessor=preprocessor + this.metadataParser=metadataParser + this.postprocessor=postprocessor + } + +} diff --git 
a/src/main/kotlin/net/dankito/readability4j/extended/processor/ArticleGrabberExtended.kt b/src/main/kotlin/net/dankito/readability4j/extended/processor/ArticleGrabberExtended.kt index 1592e8b..b224f41 100644 --- a/src/main/kotlin/net/dankito/readability4j/extended/processor/ArticleGrabberExtended.kt +++ b/src/main/kotlin/net/dankito/readability4j/extended/processor/ArticleGrabberExtended.kt @@ -1,14 +1,15 @@ package net.dankito.readability4j.extended.processor -import net.dankito.readability4j.extended.util.RegExUtilExtended +import net.dankito.readability4j.extended.util.BaseRegexUtilExtended import net.dankito.readability4j.model.ReadabilityOptions import net.dankito.readability4j.processor.ArticleGrabber -import org.jsoup.nodes.Element +open class ArticleGrabberExtended @JvmOverloads constructor(options: ReadabilityOptions, protected val regExExtended: BaseRegexUtilExtended) : ArticleGrabber(options, regExExtended) { -open class ArticleGrabberExtended(options: ReadabilityOptions, protected val regExExtended: RegExUtilExtended) : ArticleGrabber(options, regExExtended) { + /* + todo do better implementation because - override fun shouldKeepSibling(sibling: Element): Boolean { + override fun shouldKeepSibling(sibling: Element): Boolean { return super.shouldKeepSibling(sibling) || containsImageToKeep(sibling) } @@ -17,7 +18,7 @@ open class ArticleGrabberExtended(options: ReadabilityOptions, protected val reg if(images.size > 0) { if(isImageElementToKeep(element)) { images.forEach { image -> - if(isImageElementToKeep(image) == false) { + if(!isImageElementToKeep(image)) { return false } } @@ -33,6 +34,6 @@ open class ArticleGrabberExtended(options: ReadabilityOptions, protected val reg val matchString = element.id() + " " + element.className() return regExExtended.keepImage(matchString) - } + }*/ -} \ No newline at end of file +} diff --git a/src/main/kotlin/net/dankito/readability4j/extended/processor/PostprocessorExtended.kt 
b/src/main/kotlin/net/dankito/readability4j/extended/processor/PostprocessorExtended.kt index 495695d..78dd946 100644 --- a/src/main/kotlin/net/dankito/readability4j/extended/processor/PostprocessorExtended.kt +++ b/src/main/kotlin/net/dankito/readability4j/extended/processor/PostprocessorExtended.kt @@ -1,33 +1,51 @@ package net.dankito.readability4j.extended.processor +import net.dankito.readability4j.model.ReadabilityOptions import net.dankito.readability4j.processor.Postprocessor import org.jsoup.nodes.Attributes -import org.jsoup.nodes.Document import org.jsoup.nodes.Element import org.jsoup.parser.Tag open class PostprocessorExtended : Postprocessor() { - override fun postProcessContent(originalDocument: Document, articleContent: Element, articleUri: String, additionalClassesToPreserve: Collection) { + override fun postProcessContent( + articleContent: Element, + baseUri: String, + documentUri: String, + options: ReadabilityOptions + ) { // call these methods before super.postProcessContent() so that afterwards relative urls are made absolute makeLazyLoadingUrlsEagerLoading(articleContent) fixAmpImageUris(articleContent) - super.postProcessContent(originalDocument, articleContent, articleUri, additionalClassesToPreserve) + super.postProcessContent(articleContent, baseUri, documentUri, options) } protected open fun makeLazyLoadingUrlsEagerLoading(articleContent: Element) { articleContent.select("img").forEach { imgElement -> - makeLazyLoadingUrlEagerLoading(imgElement, "src", - listOf("data-src", "data-original", "data-actualsrc", "data-lazy-src", "data-delayed-url", - "data-li-src", "data-pagespeed-lazy-src")) + makeLazyLoadingUrlEagerLoading( + imgElement, "src", + listOf( + "data-src", + "data-original", + "data-actualsrc", + "data-lazy-src", + "data-delayed-url", + "data-li-src", + "data-pagespeed-lazy-src" + ) + ) } } - protected open fun makeLazyLoadingUrlEagerLoading(element: Element, attributeToSet: String, lazyLoadingAttributes: List) { + private fun 
makeLazyLoadingUrlEagerLoading( + element: Element, + attributeToSet: String, + lazyLoadingAttributes: List + ) { lazyLoadingAttributes.forEach { lazyLoadingAttributeName -> val value = element.attr(lazyLoadingAttributeName) @@ -39,7 +57,7 @@ open class PostprocessorExtended : Postprocessor() { } } - protected open fun fixAmpImageUris(element: Element) { + private fun fixAmpImageUris(element: Element) { element.getElementsByTag("amp-img").forEach { amp_img -> if (amp_img.childNodeSize() == 0) { @@ -53,9 +71,8 @@ open class PostprocessorExtended : Postprocessor() { } } - - override fun fixRelativeUris(originalDocument: Document, element: Element, scheme: String, prePath: String, - pathBase: String) { + //now the default implementation do that but just o + /*override fun fixRelativeUris(element: Element, pathBase: String, documentUri: String) { val baseUrl = originalDocument.head().select("base").first()?.attr("href") @@ -65,6 +82,5 @@ open class PostprocessorExtended : Postprocessor() { else { super.fixRelativeUris(originalDocument, element, scheme, prePath, pathBase) } - } - -} \ No newline at end of file + }*/ +} diff --git a/src/main/kotlin/net/dankito/readability4j/extended/util/RegExUtilExtended.kt b/src/main/kotlin/net/dankito/readability4j/extended/util/BaseRegexUtilExtended.kt similarity index 88% rename from src/main/kotlin/net/dankito/readability4j/extended/util/RegExUtilExtended.kt rename to src/main/kotlin/net/dankito/readability4j/extended/util/BaseRegexUtilExtended.kt index ac85bd8..23cf6d2 100644 --- a/src/main/kotlin/net/dankito/readability4j/extended/util/RegExUtilExtended.kt +++ b/src/main/kotlin/net/dankito/readability4j/extended/util/BaseRegexUtilExtended.kt @@ -1,10 +1,10 @@ package net.dankito.readability4j.extended.util -import net.dankito.readability4j.util.RegExUtil +import net.dankito.readability4j.util.BaseRegexUtil import java.util.regex.Pattern -open class RegExUtilExtended : RegExUtil { +open class BaseRegexUtilExtended : BaseRegexUtil 
{ companion object { const val RemoveImageDefaultPattern = "author|avatar|thumbnail" // CHANGE: this is not in Mozilla's Readability @@ -15,7 +15,7 @@ open class RegExUtilExtended : RegExUtil { protected val removeImage: Pattern - + @JvmOverloads constructor(unlikelyCandidatesPattern: String = UnlikelyCandidatesDefaultPattern, okMaybeItsACandidatePattern: String = OkMaybeItsACandidateDefaultPattern, positivePattern: String = PositiveDefaultPattern, negativePattern: String = NegativeDefaultPattern + NegativeDefaultPatternExtended, extraneousPattern: String = ExtraneousDefaultPattern, bylinePattern: String = BylineDefaultPattern, @@ -30,11 +30,11 @@ open class RegExUtilExtended : RegExUtil { open fun keepImage(matchString: String): Boolean { // CHANGE: this is not in Mozilla's Readability - if((isNegative(matchString) && isPositive(matchString) == false) || removeImage.matcher(matchString).find()) { + if((isNegative(matchString) && !isPositive(matchString)) || removeImage.matcher(matchString).find()) { return false } return true } -} \ No newline at end of file +} diff --git a/src/main/kotlin/net/dankito/readability4j/model/ArticleGrabberOptions.kt b/src/main/kotlin/net/dankito/readability4j/model/ArticleGrabberOptions.kt index e431e05..52752e9 100644 --- a/src/main/kotlin/net/dankito/readability4j/model/ArticleGrabberOptions.kt +++ b/src/main/kotlin/net/dankito/readability4j/model/ArticleGrabberOptions.kt @@ -1,6 +1,12 @@ package net.dankito.readability4j.model - +/** + * This class represents the flags + * FLAG_STRIP_UNLIKELYS, + * FLAG_WEIGHT_CLASSES, + * FLAG_CLEAN_CONDITIONALLY + * on Readability.js + * */ open class ArticleGrabberOptions(var stripUnlikelyCandidates: Boolean = true, var weightClasses: Boolean = true, var cleanConditionally: Boolean = true) diff --git a/src/main/kotlin/net/dankito/readability4j/model/ArticleMetadata.kt b/src/main/kotlin/net/dankito/readability4j/model/ArticleMetadata.kt index 4731c33..c67069b 100644 --- 
a/src/main/kotlin/net/dankito/readability4j/model/ArticleMetadata.kt +++ b/src/main/kotlin/net/dankito/readability4j/model/ArticleMetadata.kt @@ -1,4 +1,48 @@ package net.dankito.readability4j.model -open class ArticleMetadata(var title: String? = null, var byline: String? = null, var excerpt: String? = null, var dir: String? = null, var charset: String? = null) \ No newline at end of file + +open class ArticleMetadata() //As the class itself its always called without any args removed them +{ + + /** + * Just for retrocompatibility + * + * */ + //but saved old constructor + constructor(title: String?=null, + byline: String?=null, + excerpt: String?=null, + dir: String?=null, + charset: String?=null) : this() { + this.title=title + this.byline=byline + this.excerpt=excerpt + this.dir=dir + this.charset=charset + } + + var title: String? = null + var byline: String? = null + var excerpt: String? = null + var siteName:String? = null + var publishedTime:String? = null + + //this is text direction that in + @Deprecated("This is always gotten from the ArticleGrabber Object," + + "don't use this one except for testing") + var dir :String?=null + @Deprecated("This is always utf-8 right now") + var charset: String? = "utf-8" + + override fun toString() = buildString { + arrayOf(title).joinToString() + } + + //JSONLDCompatibility + var datePublished:String? 
get(){ + return publishedTime + } set(value){ + publishedTime=value + } +} diff --git a/src/main/kotlin/net/dankito/readability4j/model/ReadabilityObject.kt b/src/main/kotlin/net/dankito/readability4j/model/ReadabilityObject.kt index 18ae24b..3b2d759 100644 --- a/src/main/kotlin/net/dankito/readability4j/model/ReadabilityObject.kt +++ b/src/main/kotlin/net/dankito/readability4j/model/ReadabilityObject.kt @@ -1,4 +1,6 @@ package net.dankito.readability4j.model - -open class ReadabilityObject(var contentScore: Double) \ No newline at end of file +/** + * This class is a dummy one for maintaining the porting easier as copy and paste + * */ +open class ReadabilityObject(var contentScore: Double) diff --git a/src/main/kotlin/net/dankito/readability4j/model/ReadabilityOptions.kt b/src/main/kotlin/net/dankito/readability4j/model/ReadabilityOptions.kt index e3b3e55..1d3ba40 100644 --- a/src/main/kotlin/net/dankito/readability4j/model/ReadabilityOptions.kt +++ b/src/main/kotlin/net/dankito/readability4j/model/ReadabilityOptions.kt @@ -1,10 +1,27 @@ package net.dankito.readability4j.model +import net.dankito.readability4j.util.BaseRegexUtil -open class ReadabilityOptions(val maxElemsToParse: Int = DEFAULT_MAX_ELEMS_TO_PARSE, + +open class ReadabilityOptions +@JvmOverloads +constructor(val maxElemsToParse: Int = DEFAULT_MAX_ELEMS_TO_PARSE, val nbTopCandidates: Int = DEFAULT_N_TOP_CANDIDATES, - val wordThreshold: Int = DEFAULT_WORD_THRESHOLD, - val additionalClassesToPreserve: Collection = ArrayList()) { + val charThreshold: Int = DEFAULT_CHAR_THRESHOLD, + //changed to set as readability as you shouldn't have duplicates here + val additionalClassesToPreserve: Set = setOf(), + val allowedVideoRegex: Regex=Regex(BaseRegexUtil.VideosDefaultPattern), + val linkDensityModifier: Double=0.0, + val disableJSONLD:Boolean = false , + val keepClasses: Boolean = false , + val debug:Boolean = false) { + + @Deprecated("", + replaceWith = ReplaceWith("charThreshold"), + level = 
DeprecationLevel.WARNING) + val wordThreshold:Int get() { + return charThreshold + } companion object { // Max number of nodes supported by this parser. Default: 0 (no limit) @@ -15,7 +32,12 @@ open class ReadabilityOptions(val maxElemsToParse: Int = DEFAULT_MAX_ELEMS_TO_PA const val DEFAULT_N_TOP_CANDIDATES = 5 // The default number of words an article must have in order to return a result + const val DEFAULT_CHAR_THRESHOLD = 500 + + @Deprecated("Changed to DEFAULT_CHAR_THRESHOLD", + replaceWith = ReplaceWith("DEFAULT_CHAR_THRESHOLD"), + level = DeprecationLevel.WARNING) const val DEFAULT_WORD_THRESHOLD = 500 } -} \ No newline at end of file +} diff --git a/src/main/kotlin/net/dankito/readability4j/processor/ArticleGrabber.kt b/src/main/kotlin/net/dankito/readability4j/processor/ArticleGrabber.kt index ae329f5..fca00ab 100644 --- a/src/main/kotlin/net/dankito/readability4j/processor/ArticleGrabber.kt +++ b/src/main/kotlin/net/dankito/readability4j/processor/ArticleGrabber.kt @@ -4,41 +4,44 @@ import net.dankito.readability4j.model.ArticleGrabberOptions import net.dankito.readability4j.model.ArticleMetadata import net.dankito.readability4j.model.ReadabilityObject import net.dankito.readability4j.model.ReadabilityOptions -import net.dankito.readability4j.util.RegExUtil +import net.dankito.readability4j.util.BaseRegexUtil +import net.dankito.readability4j.util.log +import net.dankito.readability4j.util.logDebug import org.jsoup.nodes.Document import org.jsoup.nodes.Element -import org.jsoup.nodes.TextNode -import org.jsoup.select.Elements import org.slf4j.LoggerFactory -import java.util.* -import kotlin.collections.ArrayList -import kotlin.collections.HashMap +import kotlin.math.floor +import kotlin.math.max +import kotlin.math.min -open class ArticleGrabber(protected val options: ReadabilityOptions, protected val regEx: RegExUtil = RegExUtil()) : ProcessorBase() { +open class ArticleGrabber(options: ReadabilityOptions, override val regex: BaseRegexUtil = 
BaseRegexUtil()) : ProcessorBase() { companion object { // Element tags to score by default. - val DEFAULT_TAGS_TO_SCORE = Arrays.asList("section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre") - - - val DIV_TO_P_ELEMS = Arrays.asList("a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul", "select") - - val ALTER_TO_DIV_EXCEPTIONS = Arrays.asList("div", "article", "section", "p") - - val PRESENTATIONAL_ATTRIBUTES = Arrays.asList("align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace") - - val DEPRECATED_SIZE_ATTRIBUTE_ELEMS = Arrays.asList("table", "th", "td", "hr", "pre") - - val EMBEDDED_NODES = Arrays.asList("object", "embed", "iframe") - - val DATA_TABLE_DESCENDANTS = Arrays.asList("col", "colgroup", "tfoot", "thead", "th") - - + val DEFAULT_TAGS_TO_SCORE = listOf("section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre") + + val DIV_TO_P_ELEMS = listOf("blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul") + val UNLIKELY_ROLES = listOf("menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog") + val ALTER_TO_DIV_EXCEPTIONS = listOf("div", "article", "section", "p", "ol", "ul") + val PRESENTATIONAL_ATTRIBUTES = listOf("align", "background", "bgcolor", "border", + "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", + "vspace") + val DEPRECATED_SIZE_ATTRIBUTE_ELEMS = listOf("table", "th", "td", "hr", "pre") + // The commented out elements qualify as phrasing content but tend to be + // removed by readability when put into paragraphs, so we ignore them here. 
+ // "CANVAS", "IFRAME", "SVG", "VIDEO", + val PHRASING_ELEMS = listOf("abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em", "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object", "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong", "sub", "sup", "textarea", "time", "var", "wbr") private val log = LoggerFactory.getLogger(ArticleGrabber::class.java) } + var articleLang: String?= null + protected set + + var articleTitle: String? = null + private set + var articleByline: String? = null protected set @@ -46,33 +49,41 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v protected set - protected val nbTopCandidates = options.nbTopCandidates - protected val wordThreshold = options.wordThreshold + private val nbTopCandidates = options.nbTopCandidates + private val charThreshold = options.charThreshold - protected val readabilityObjects = HashMap() + private val readabilityObjects = HashMap() - protected val readabilityDataTable = HashMap() + private val readabilityDataTable = HashMap() + //changed to global because this class is always reinstated + private val options: ArticleGrabberOptions = ArticleGrabberOptions() + //changed to global because inside while is always reinstated and has no sense + private val attempts = arrayListOf>() - open fun grabArticle(doc: Document, metadata: ArticleMetadata, options: ArticleGrabberOptions = ArticleGrabberOptions(), pageElement: Element? = null): Element? { - log.debug("**** grabArticle ****") + open fun grabArticle(doc: Document, metadata: ArticleMetadata, pageElement: Element? = null): Element? { + log.info("**** grabArticle ****") val isPaging = pageElement != null val page = pageElement ?: doc.body() + articleTitle=metadata.title + articleByline=metadata.byline + // We can't grab an article if we don't have a page! if(page == null) { - log.debug("No body found in document. 
Abort.") + log.info("No body found in document. Abort.") return null } - val pageCacheHtml = doc.html() + val pageCacheHtml = page.html() while(true) { + log.info("Starting grabArticle loop") // First, node prepping. Trash nodes that look cruddy (like ones with the // class name "comment", etc), and turn divs into P tags where they have been // used inappropriately (as in, where they contain no other block level elements.) - val elementsToScore = prepareNodes(doc, options) + val elementsToScore = prepareNodes(doc) /** * Loop through all paragraphs, and assign a score to them based on how content-y they look. @@ -80,11 +91,12 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v * * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. **/ - val candidates = scoreElements(elementsToScore, options) + //cnn testcase problem starts here + val candidates = scoreElements(elementsToScore) // After we've calculated scores, loop through all of the possible // candidate nodes we found and find the one with the highest score. - val topCandidateResult = getTopCandidate(page, candidates, options) + val topCandidateResult = getTopCandidate(page, candidates) val topCandidate = topCandidateResult.first val neededToCreateTopCandidate= topCandidateResult.second @@ -94,63 +106,57 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v var articleContent = createArticleContent(doc, topCandidate, isPaging) - log.debug("Article content pre-prep: {}", articleContent.html()) + log.info("Article content pre-prep: {}", articleContent.html().logDebug()) // So we have all of the content that we need. Now we clean it up for presentation. 
- prepArticle(articleContent, options, metadata) - log.debug("Article content post-prep: {}", articleContent.html()) + // ok all the bugs left are here or that looks like + prepArticle(articleContent) + log.info("Article content post-prep: {}", articleContent.html().logDebug()) if(neededToCreateTopCandidate) { // We already created a fake div thing, and there wouldn't have been any siblings left // for the previous loop, so there's no point trying to create a new div, and then // move all the children over. Just assign IDs and class names here. No need to append // because that already happened anyway. - topCandidate.attr("id", "readability-page-1") - topCandidate.addClass("page") + topCandidate.id( "readability-page-1") + topCandidate.classNames(setOf("page")) } else { val div = doc.createElement("div") - div.attr("id", "readability-page-1") - div.addClass("page") + div.id("readability-page-1") + div.classNames(setOf("page")) - ArrayList(articleContent.childNodes()).forEach { child -> - child.remove() - div.appendChild(child) - } + div.appendChildren(articleContent.childNodes()) articleContent.appendChild(div) } - log.debug("Article content after paging: {}", articleContent.html()) + log.info("Article content after paging: {}", articleContent.html().logDebug()) var parseSuccessful = true - val attempts = ArrayList>() // Now that we've gone through the full algorithm, check to see if // we got any meaningful content. If we didn't, we may need to re-run // grabArticle with different flags set. This gives us a higher likelihood of // finding the content, and the sieve approach gives us a higher likelihood of // finding the -right- content. 
- val textLength = getInnerText(articleContent, regEx, true).length - if(textLength < this.wordThreshold) { + val textLength = getInnerText(articleContent, true).length + if(textLength < this.charThreshold) { parseSuccessful = false page.html(pageCacheHtml) + attempts.add(Pair(articleContent, textLength)) if(options.stripUnlikelyCandidates) { options.stripUnlikelyCandidates = false - attempts.add(Pair(articleContent, textLength)) } else if(options.weightClasses) { options.weightClasses = false - attempts.add(Pair(articleContent, textLength)) } else if(options.cleanConditionally) { options.cleanConditionally = false - attempts.add(Pair(articleContent, textLength)) } else { - attempts.add(Pair(articleContent, textLength)) // No luck after removing flags, just return the longest text we found during the different loops - attempts.sortBy { it.second } + attempts.sortByDescending { it.second } // But first check if we actually have something if (attempts.isEmpty() || attempts[0].second <= 0) { @@ -174,50 +180,154 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v /* First step: prepare nodes */ - protected open fun prepareNodes(doc: Document, options: ArticleGrabberOptions): List { + private fun prepareNodes(doc: Document): List { val elementsToScore = ArrayList() var node: Element? = doc + var shouldRemoveTitleHeader = true while(node != null) { + if (node.tagName() == "html"){ + if (node.hasAttr("lang")) { + articleLang = node.attr("lang") + } + } + val matchString = node.className() + " " + node.id() - // Check to see if this node is a byline, and remove it if it is. 
- if(checkByline(node, matchString)) { - node = removeAndGetNext(node, "byline") + // Check if node is visible or no (who knows if you don't have the full page) + if(!isProbablyVisible(node)) { + log.info("Removing hidden node {}", matchString) + node = removeAndGetNext(node) + continue + } + + // User is not able to see elements applied with both "aria-modal = true" and "role = dialog" + if ( + node.attr("aria-modal") == "true" && + node.attr("role") == "dialog" + ) { + node = removeAndGetNext(node) + continue + } + + // If we don't have a byline yet check to see if this node is a byline; + // if it is store the byline and remove the node. + if(articleByline == null && isValidByline(node, matchString)) { + + // Find child node matching [itemprop="name"] and use that if it exists for a more accurate author name byline + val endOfSearchMarkerNode = getNextNode(node, true) + var itemPropNameNode:Element? = null + var next = getNextNode(node) + while (next!=null && next != endOfSearchMarkerNode) { + val itemprop = next.attr("itemprop") + if (itemprop.isNotEmpty() && itemprop.contains("name")) { + itemPropNameNode = next + break + } else { + next = getNextNode(next) + } + } + articleByline = (itemPropNameNode?:node).text().trim() + node = removeAndGetNext(node) + continue + } + + if ( shouldRemoveTitleHeader && headerDuplicatesTitle(node)) { + log.info( + "Removing header: {} {}", + node.text().trim(), + this.articleTitle?.trim() + ) + shouldRemoveTitleHeader = false + node = removeAndGetNext(node) continue } // Remove unlikely candidates if(options.stripUnlikelyCandidates) { - if(regEx.isUnlikelyCandidate(matchString) && - regEx.okMaybeItsACandidate(matchString) == false && - node.tagName() != "body" && - node.tagName() != "a") { - node = this.removeAndGetNext(node, "Removing unlikely candidate") + if(regex.isUnlikelyCandidate(matchString) && + !regex.okMaybeItsACandidate(matchString) && + !hasAncestorTag(node, "table") && + !hasAncestorTag(node, "code") && + 
node.tagName() !in listOf("body", "a")) { + log.info("Removing unlikely candidate - {}", matchString) + + node = this.removeAndGetNext(node) + continue + } + + if (node.attr("role") in UNLIKELY_ROLES){ + log.info( + "Removing content with role {}{}{}", + node.attr("role"), + " - ", + matchString + ) + node = removeAndGetNext(node) continue } } + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). - if((node.tagName() == "div" || node.tagName() == "section" || node.tagName() == "header" || - node.tagName() == "h1" || node.tagName() == "h2" || node.tagName() == "h3" || - node.tagName() == "h4" || node.tagName() == "h5" || node.tagName() == "h6") && - this.isElementWithoutContent(node)) { - node = this.removeAndGetNext(node, "node without content") + if((node.tagName() in + listOf("div","section","header","h1","h2","h3","h4","h5","h6")) + && this.isElementWithoutContent(node)) { + node = this.removeAndGetNext(node) continue } - if(DEFAULT_TAGS_TO_SCORE.contains(node.tagName())) { + if(node.tagName() in DEFAULT_TAGS_TO_SCORE) { elementsToScore.add(node) } // Turn all divs that don't have children block level elements into p's if(node.tagName() == "div") { + // Put phrasing content into paragraphs. + var childNode = node.firstChild() + while (childNode!=null) { + var nextSibling = childNode.nextSibling() + if (isPhrasingContent(childNode)) { + val fragment=Element("div") + // Collect all consecutive phrasing content into a fragment. + do { + nextSibling = childNode!!.nextSibling() + fragment.appendChild(childNode); + childNode = nextSibling; + } while (childNode!=null && isPhrasingContent(childNode)) + + // Trim leading and trailing whitespace from the fragment. + while ( + fragment.firstChild()!=null && + isWhitespace(fragment.firstChild()!!) + ) { + fragment.firstChild()?.remove(); + } + var lastChild = fragment.lastChild() + while ( + fragment.lastChild()!=null && + isWhitespace(fragment.lastChild()!!) 
+ ) { + fragment.lastChild()?.remove(); + } + + // If the fragment contains anything, wrap it in a paragraph and + // insert it before the next non-phrasing node. + if (!regex.isWhitespace(fragment.html())) { + val p = doc.createElement("p"); + p.appendChildren(fragment.childNodes()); + node.insertChildren(nextSibling?.siblingIndex()?:-1,p) + } + } + childNode = nextSibling + } + // Sites like http://mobile.slate.com encloses each paragraph with a DIV // element. DIVs with only a P element inside and no text content can be // safely converted into plain P elements to avoid confusing the scoring // algorithm with DIVs with are, in practice, paragraphs. - if(this.hasSinglePInsideElement(node)) { + if(hasSingleTagInsideElement(node,"p") && + getLinkDensity(node) < 0.25) { val newNode = node.child(0) node.replaceWith(newNode) node = newNode @@ -227,161 +337,141 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v setNodeTag(node, "p") elementsToScore.add(node) } - else { - // EXPERIMENTAL - node.childNodes().forEach { childNode -> - if(childNode is TextNode && childNode.text().trim().length > 0) { - val p = doc.createElement("p") - p.text(childNode.text()) - p.attr("style", "display: inline;") - p.addClass("readability-styled") - childNode.replaceWith(p) - } - } - } } - - node = if(node != null) this.getNextNode(node) else null + node = this.getNextNode(node) } return elementsToScore } - protected open fun checkByline(node: Element, matchString: String): Boolean { - if(this.articleByline != null) { + + private fun headerDuplicatesTitle(node: Element): Boolean { + if (articleTitle==null) + return false + if (!node.tagName().equals("H1",ignoreCase = true) && + !node.tagName().equals("H2",ignoreCase = true)) { return false } + val heading = node.text() + log.info("Evaluating similarity of header: {} {}", heading, articleTitle) + return this.textSimilarity(articleTitle!!, heading) > 0.75 + } - val rel = node.attr("rel") - if((rel == "author" 
|| regEx.isByline(matchString)) && isValidByline(node.wholeText())) { - this.articleByline = node.text().trim() - return true - } - return false + private fun isProbablyVisible(node: Element): Boolean { + return(!node.hasAttr("style") || + !node.attr("style") + .contains(Regex("(display(\\s*)?:(\\s*)?none)|(visibility(\\s*)?:(\\s*)?hidden)"))) && + !node.hasAttr("hidden") && + (!node.hasAttr("aria-hidden") || + (!node.attr("aria-hidden").contains("true") && node.attr("aria-hidden").isNotBlank()) || + (node.className().isNotEmpty() && + node.className().contains("fallback-image"))) } + +// protected open fun checkAndSaveByline(node: Element, matchString: String): Boolean { +// +// return true +// } + /** * Check whether the input string could be a byline. * This verifies that the input is a string, and that the length * is less than 100 chars. */ - protected open fun isValidByline(text: String): Boolean { - val byline = text.trim() - - return (byline.isNotEmpty()) && (byline.length < 100) - } - + private fun isValidByline(node: Element, matchString: String): Boolean { + val rel = node.attr("rel") + val itemprop = node.attr("itemprop") + val bylineLength = node.wholeText().trim().length - protected open fun isElementWithoutContent(node: Element): Boolean { - return node.text().isBlank() && - (node.children().size == 0 || - node.children().size == node.getElementsByTag("br").size + node.getElementsByTag("hr").size) + return ((rel == "author" || itemprop.contains("author") + || regex.isByline(matchString)) && bylineLength in 1 until 100) } - /** - * Check if this node has only whitespace and a single P element - * Returns false if the DIV node contains non-empty text nodes - * or if it contains no P or more than 1 element. 
- */ - protected open fun hasSinglePInsideElement(element: Element): Boolean { - // There should be exactly 1 element child which is a P: - if(element.children().size != 1 || element.child(0).tagName() != "p") { - return false - } - - // And there should be no text nodes with real content - element.childNodes().forEach { node -> - if(node is TextNode && regEx.hasContent(node.text())) { - return false - } - } - - return true - } - /** * Determine whether element has any children block level elements. */ - protected open fun hasChildBlockElement(element: Element): Boolean { - element.children().forEach { node -> - if(DIV_TO_P_ELEMS.contains(node.tagName()) || hasChildBlockElement(node)) { - return true - } + private fun hasChildBlockElement(element: Element): Boolean { + return element.children().any { node -> + node.tagName() in DIV_TO_P_ELEMS || hasChildBlockElement(node) } - - return false } - protected open fun setNodeTag(node: Element, tagName: String) { + private fun setNodeTag(node: Element, tagName: String): Element { + log.info("setNodeTag {} {}", node.log(), tagName) node.tagName(tagName) + return node } - /* Second step: Score elements */ - protected open fun scoreElements(elementsToScore: List, options: ArticleGrabberOptions): List { + private fun scoreElements(elementsToScore: List): List { val candidates = ArrayList() - + val candidateLog = false elementsToScore.forEach { elementToScore -> if(elementToScore.parentNode() == null) { return@forEach } // If this paragraph is less than 25 characters, don't even count it. - val innerText = this.getInnerText(elementToScore, regEx) + val innerText = this.getInnerText(elementToScore) if(innerText.length < 25) { return@forEach } // Exclude nodes with no ancestor. 
- val ancestors = this.getNodeAncestors(elementToScore, 3) - if(ancestors.size == 0) { + val ancestors = this.getNodeAncestors(elementToScore, 5) + if(ancestors.isEmpty()) { return@forEach } - var contentScore = 0.0 + var contentScore:Double = 0.0 // Add a point for the paragraph itself as a base. contentScore += 1 // Add points for any commas within this paragraph. - contentScore += innerText.split(',').size + contentScore += regex.splitCommas(innerText).size // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += Math.min(Math.floor(innerText.length / 100.0), 3.0) + contentScore += min(floor(innerText.length / 100.0),3.0) // Initialize and score ancestors. - for(level in 0..ancestors.size - 1) { - val ancestor = ancestors[level] - if(ancestor.tagName().isNullOrBlank()) { // with Jsoup this should never be true as we're only handling Elements - return@forEach + for((level,ancestor) in ancestors.withIndex()) { + if(ancestor.tagName().isNullOrBlank() && ancestor.parentNode() !=null) { + break + } + + if(ancestor is Document|| ancestor.normalName()=="html") { + break + } + + if (ancestor.id()=="storycontent" && !candidateLog){ + ancestors.withIndex().forEach { log.info("ancestor level {} {}", it.index, it.value.log()) } } - if(getReadabilityObject(ancestor) == null) { + if(ancestor.readability == null) { + initializeNode(ancestor) candidates.add(ancestor) - initializeNode(ancestor, options) } // Node score divider: // - parent: 1 (no division) // - grandparent: 2 // - great grandparent+: ancestor level * 3 - val scoreDivider = - if(level == 0) - 1 - else if(level == 1) - 2 - else - level * 3 - - getReadabilityObject(ancestor)?.let { readability -> - readability.contentScore += contentScore / scoreDivider.toDouble() - } + val scoreDivider:Double = + when (level) { + 0 -> 1.0 + 1 -> 2.0 + else -> level * 3.0 + } + + ancestor.readability!!.contentScore += contentScore / scoreDivider } } @@ -392,9 +482,8 @@ open class 
ArticleGrabber(protected val options: ReadabilityOptions, protected v * Initialize a node with the readability object. Also checks the * className/id for special names to add to its score. */ - protected open fun initializeNode(node: Element, options: ArticleGrabberOptions): ReadabilityObject { + private fun initializeNode(node: Element) { val readability = ReadabilityObject(0.0) - readabilityObjects.put(node, readability) when(node.tagName()) { "div" -> @@ -425,17 +514,17 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v readability.contentScore -= 5 } - readability.contentScore += getClassWeight(node, options) + readability.contentScore += getClassWeight(node) - return readability + node.readability = readability } /** * Get an elements class/id weight. Uses regular expressions to tell if this * element looks good or bad. */ - protected open fun getClassWeight(e: Element, options: ArticleGrabberOptions): Int { - if(options.weightClasses == false) { + private fun getClassWeight(e: Element): Int { + if(!options.weightClasses) { return 0 } @@ -443,22 +532,22 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v // Look for a special classname if(e.className().isNotBlank()) { - if(regEx.isNegative(e.className())) { + if(regex.isNegative(e.className())) { weight -= 25 } - if(regEx.isPositive(e.className())) { + if(regex.isPositive(e.className())) { weight += 25 } } // Look for a special ID if(e.id().isNotBlank()) { - if(regEx.isNegative(e.id())) { + if(regex.isNegative(e.id())) { weight -= 25 } - if(regEx.isPositive(e.id())) { + if(regex.isPositive(e.id())) { weight += 25 } } @@ -466,18 +555,18 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v return weight } - protected open fun getNodeAncestors(node: Element, maxDepth: Int = 0): List { + private fun getNodeAncestors(node: Element, maxDepth: Int = 0): List { var i = 0 val ancestors = ArrayList() var next = node 
while(next.parent() != null) { - ancestors.add(next.parent()) - if(++i == maxDepth) { + val parentNode = next.parent()!! + ancestors.add(parentNode) + if((maxDepth>=1 && ++i == maxDepth)) { break } - - next = next.parent() + next = parentNode } return ancestors @@ -487,26 +576,28 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v /* Third step: Get top candidate */ - protected open fun getTopCandidate(page: Element, candidates: List, options: ArticleGrabberOptions): Pair { + private fun getTopCandidate(page: Element, candidates: List): Pair { val topCandidates = ArrayList() candidates.forEach { candidate -> - getReadabilityObject(candidate)?.let { readability -> + candidate.readability?.let { readability -> // Scale the final candidates score based on link density. Good content // should have a relatively small link density (5% or less) and be mostly // unaffected by this operation. - val candidateScore = readability.contentScore * (1 - this.getLinkDensity(candidate)) - readability.contentScore = candidateScore + val ld =this.getLinkDensity(candidate) +// log.info("Before ld score: {}",candidate.readability?.contentScore) + + val candidateScore = readability.contentScore * (1 - ld) + candidate.readability?.contentScore = candidateScore - log.debug("Candidate: {} with score {}", candidate, candidateScore) + log.info("Candidate:\",\"{}\",\"with score {}\"]", candidate.log(), candidateScore) - for(t in 0..nbTopCandidates - 1) { + for(t in 0.. 
t) topCandidates[t] else null - val topCandidateReadability = if(aTopCandidate != null) getReadabilityObject(aTopCandidate) else null + val topCandidateReadability = aTopCandidate?.readability if(aTopCandidate == null || (topCandidateReadability != null && candidateScore > topCandidateReadability.contentScore)) { topCandidates.add(t, candidate) - if(topCandidates.size > this.nbTopCandidates) { topCandidates.removeAt(nbTopCandidates) } @@ -526,15 +617,18 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v topCandidate = Element("div") // Move everything (not just elements, also text nodes etc.) into the container // so we even include text directly in the body: - ArrayList(page.childNodes()).forEach { child -> - log.debug("Moving child out: {}", child) - child.remove() - topCandidate?.appendChild(child) + while(page.firstChild()!=null){ +// if (child is Comment){ +// //javascript ignores it +// return@forEach +// } + log.info("Moving child out: {}", page.firstChild()?.log()) + page.firstChild()?.let { topCandidate!!.appendChild(it) } } page.appendChild(topCandidate) - this.initializeNode(topCandidate, options) + initializeNode(topCandidate) return Pair(topCandidate, true) } @@ -543,24 +637,34 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v // and whose scores are quite closed with current `topCandidate` node. 
val alternativeCandidateAncestors = ArrayList>() - getReadabilityObject(topCandidate)?.let { topCandidateReadability -> - topCandidates.filter { it != topCandidate }.forEach { otherTopCandidate -> - if(((getReadabilityObject(otherTopCandidate)?.contentScore ?: 0.0) / topCandidateReadability.contentScore) >= 0.75) { - alternativeCandidateAncestors.add(this.getNodeAncestors(otherTopCandidate)) + for (otherTopCandidate in topCandidates.filter { it != topCandidate }) { + topCandidate.readability?.let { topCandidateReadability -> + otherTopCandidate.readability?.let {otherTopCandidateReadability-> + if (((otherTopCandidateReadability.contentScore) / + topCandidateReadability.contentScore) >= 0.75 + ) { + alternativeCandidateAncestors.add( + this.getNodeAncestors( + otherTopCandidate + ) + ) + } } } } - val MINIMUM_TOPCANDIDATES = 3 + if(alternativeCandidateAncestors.size >= MINIMUM_TOPCANDIDATES) { parentOfTopCandidate = topCandidate.parent() - - while(parentOfTopCandidate != null && parentOfTopCandidate.tagName() !== "body") { + while(parentOfTopCandidate != null && parentOfTopCandidate.tagName() != "body") { var listsContainingThisAncestor = 0 var ancestorIndex = 0 - while(ancestorIndex < alternativeCandidateAncestors.size && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES) { - if(alternativeCandidateAncestors[ancestorIndex].contains(parentOfTopCandidate)) { + while(ancestorIndex < alternativeCandidateAncestors.size && + listsContainingThisAncestor < MINIMUM_TOPCANDIDATES) { + + if(alternativeCandidateAncestors[ancestorIndex]. + contains(parentOfTopCandidate)) { listsContainingThisAncestor++ } ancestorIndex++ @@ -574,9 +678,9 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v } } - topCandidate = topCandidate!! 
- if(getReadabilityObject(topCandidate) == null) { - this.initializeNode(topCandidate, options) + + if(topCandidate!!.readability == null) { + initializeNode(topCandidate) } // Because of our bonus system, parents of candidates might have scores @@ -587,12 +691,12 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v // below does some of that - but only if we've looked high enough up the DOM // tree. parentOfTopCandidate = topCandidate.parent() - var lastScore = getReadabilityObject(topCandidate)?.contentScore ?: 0.0 + var lastScore = topCandidate.readability?.contentScore!! // The scores shouldn't get too low. val scoreThreshold = lastScore / 3.0 while(parentOfTopCandidate != null && parentOfTopCandidate.tagName() != "body") { - val parentOfTopCandidateReadability = getReadabilityObject(parentOfTopCandidate) + val parentOfTopCandidateReadability = parentOfTopCandidate.readability if(parentOfTopCandidateReadability == null) { parentOfTopCandidate = parentOfTopCandidate.parent() continue @@ -614,16 +718,14 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v // If the top candidate is the only child, use parent instead. This will help sibling // joining logic when adjacent content is actually located in parent's sibling node. - topCandidate = topCandidate!! - parentOfTopCandidate = topCandidate.parent() + parentOfTopCandidate = topCandidate!!.parent() while(parentOfTopCandidate != null && parentOfTopCandidate.tagName() != "body" && parentOfTopCandidate.children().size == 1) { topCandidate = parentOfTopCandidate parentOfTopCandidate = topCandidate.parent() } - topCandidate = topCandidate!! 
- if(getReadabilityObject(topCandidate) == null) { - this.initializeNode(topCandidate, options) + if(topCandidate!!.readability == null) { + initializeNode(topCandidate) } return Pair(topCandidate, false) @@ -634,17 +736,19 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v * Get the density of links as a percentage of the content * This is the amount of text that is inside a link divided by the total text in the node. */ - protected open fun getLinkDensity(element: Element): Double { - val textLength = this.getInnerText(element, regEx).length + private fun getLinkDensity(element: Element): Double { + val textLength = this.getInnerText(element).length if(textLength == 0) { return 0.0 } - var linkLength = 0 + var linkLength = 0.0 // XXX implement _reduceNodeList? element.getElementsByTag("a").forEach { linkNode -> - linkLength += this.getInnerText(linkNode, regEx).length + val href = linkNode.attr("href") + val coefficient = if (href.isNotBlank() && regex.isHashUrl(href)) 0.3 else 1.0 + linkLength += this.getInnerText(linkNode).length.toDouble() * coefficient } return linkLength / textLength.toDouble() @@ -654,77 +758,89 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v /* Forth step: Create articleContent */ - protected open fun createArticleContent(doc: Document, topCandidate: Element, isPaging: Boolean): Element { + private fun createArticleContent(doc: Document, topCandidate: Element, isPaging: Boolean): Element { val articleContent = doc.createElement("div") if(isPaging) { articleContent.attr("id", "readability-content") } - val topCandidateReadability = getReadabilityObject(topCandidate) - if(topCandidateReadability == null) { - return articleContent - } + val topCandidateReadability = topCandidate.readability ?: return articleContent - val siblingScoreThreshold = Math.max(10.0, topCandidateReadability.contentScore * 0.2) + val siblingScoreThreshold = max(10.0, 
topCandidateReadability.contentScore * 0.2) // Keep potential top candidate's parent node to try to get text direction of it later. - val parentOfTopCandidate = topCandidate.parent() // parentOfTopCandidate may is null, see issue #12 - val siblings = parentOfTopCandidate?.children() ?: Elements() - - ArrayList(siblings).forEach { sibling -> // make a copy of children as the may get modified below -> we can get rid of s -= 1 sl -= 1 compared to original source - var append = false - - val siblingReadability = getReadabilityObject(sibling) - log.debug("Looking at sibling node: {} with score {}", sibling, siblingReadability?.contentScore ?: 0) - log.debug("Sibling has score {}", siblingReadability?.contentScore?.toString() ?: "Unknown") - - if(sibling == topCandidate) { - append = true - } - else { - var contentBonus = 0.0 - - // Give a bonus if sibling nodes and top candidates have the example same classname - if(sibling.className() == topCandidate.className() && topCandidate.className() !== "") - contentBonus += topCandidateReadability.contentScore * 0.2 - - if(siblingReadability != null && - ((siblingReadability.contentScore + contentBonus) >= siblingScoreThreshold)) { + // parentOfTopCandidate may is null, see issue #12 + topCandidate.parent()?.let {parentOfTopCandidate-> + val siblings = parentOfTopCandidate.children() + + ArrayList(siblings).forEach { sibling -> // make a copy of children as the may get modified below -> we can get rid of s -= 1 sl -= 1 compared to original source + // make a copy of children as the may get modified below -> we can get rid of s -= 1 sl -= 1 compared to original source + val siblingReadability = sibling.readability + var append = false + + log.info( + "Looking at sibling node: {} with score {}", + sibling.log(), + siblingReadability?.contentScore ?: "Unknown" + ) + + log.info( + "Sibling has score {}", + siblingReadability?.contentScore ?: "Unknown" + ) + + if (sibling == topCandidate) { append = true } - else 
if(shouldKeepSibling(sibling)) { - val linkDensity = this.getLinkDensity(sibling) - val nodeContent = this.getInnerText(sibling, regEx) - val nodeLength = nodeContent.length + else { + var contentBonus = 0.0 - if(nodeLength > 80 && linkDensity < 0.25) { - append = true + // Give a bonus if sibling nodes and top candidates have the example same classname + if (sibling.className() == topCandidate.className() && + topCandidate.className() != "") { + contentBonus += topCandidateReadability.contentScore * 0.2 } - else if(nodeLength < 80 && nodeLength > 0 && linkDensity == 0.0 && - nodeContent.contains("\\.( |$)".toRegex())) { + + if (siblingReadability != null && + ((siblingReadability.contentScore + contentBonus) >= + siblingScoreThreshold) + ) { append = true + } else if (shouldKeepSibling(sibling)) { + val linkDensity = this.getLinkDensity(sibling) + val nodeContent = this.getInnerText(sibling) + val nodeLength = nodeContent.length + + if (nodeLength > 80 && linkDensity < 0.25) { + append = true + } else if ( + nodeLength in 1..79 && + linkDensity == 0.0 && + nodeContent.contains("\\.( |$)".toRegex()) + ) { + append = true + } } } - } - if(append) { - log.debug("Appending node: {}", sibling) + if (append) { + log.info("Appending node: {}", sibling.log()) - if(ALTER_TO_DIV_EXCEPTIONS.contains(sibling.tagName()) == false) { - // We have a node that isn't a common block level element, like a form or td tag. - // Turn it into a div so it doesn't get filtered out later by accident. - log.debug("Altering sibling: {} to div.", sibling) + if (sibling.tagName() !in ALTER_TO_DIV_EXCEPTIONS) { + // We have a node that isn't a common block level element, like a form or td tag. + // Turn it into a div so it doesn't get filtered out later by accident. 
+ log.info("Altering sibling: {} to div.", sibling.log()) - setNodeTag(sibling, "div") - } + setNodeTag(sibling, "div") + } - articleContent.appendChild(sibling) + articleContent.appendChild(sibling) + } } } - return articleContent } - protected open fun shouldKeepSibling(sibling: Element): Boolean { + open fun shouldKeepSibling(sibling: Element): Boolean { return sibling.tagName() == "p" } @@ -736,51 +852,35 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v * Prepare the article node for display. Clean out any inline styles, * iframes, forms, strip extraneous

tags, etc. */ - protected open fun prepArticle(articleContent: Element, options: ArticleGrabberOptions, metadata: ArticleMetadata) { - this.cleanStyles(articleContent) + private fun prepArticle(articleContent: Element) { + //removed metadata as it isn't used anymore + cleanStyles(articleContent) // Check for data tables before we continue, to avoid removing items in // those tables, which will often be isolated even though they're // visually linked to other content-ful elements (text, images, etc.). markDataTables(articleContent) + fixLazyImages(articleContent) + // Clean out junk from the article content - this.cleanConditionally(articleContent, "form", options) - this.cleanConditionally(articleContent, "fieldset", options) + this.cleanConditionally(articleContent, "form") + this.cleanConditionally(articleContent, "fieldset") this.clean(articleContent, "object") this.clean(articleContent, "embed") this.clean(articleContent, "footer") this.clean(articleContent, "link") + this.clean(articleContent, "aside") // Clean out elements have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". - val shareRegex = "share".toRegex() - articleContent.children().forEach { topCandidate -> - cleanMatchedNodes(topCandidate, shareRegex) - } - // If there is only one h2 and its text content substantially equals article title, - // they are probably using it as a header and not a subheader, - // so remove it since we already extract the title separately. 
- val h2 = articleContent.getElementsByTag("h2") - if (h2.size == 1) { - metadata.title?.let { articleTitle -> - if(articleTitle.length > 0) { - val lengthSimilarRate = (h2[0].text().length - articleTitle.length) / articleTitle.length.toFloat() - if (Math.abs(lengthSimilarRate) < 0.5) { - val titlesMatch = - if(lengthSimilarRate > 0) { - h2[0].text().contains(articleTitle) - } - else { - articleTitle.contains(h2[0].text()) - } - - if(titlesMatch) { - this.clean(articleContent, "h2") - } - } - } + val shareElementThreshold = ReadabilityOptions.DEFAULT_CHAR_THRESHOLD + + articleContent.children().forEach { topCandidate -> + cleanMatchedNodes(topCandidate){ node, className-> + regex.isShareElement(className) && + node.text().length < shareElementThreshold } } @@ -789,171 +889,311 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v this.clean(articleContent, "textarea") this.clean(articleContent, "select") this.clean(articleContent, "button") - this.cleanHeaders(articleContent, options) + this.cleanHeaders(articleContent) // Do these last as the previous stuff may have removed junk // that will affect these - this.cleanConditionally(articleContent, "table", options) - this.cleanConditionally(articleContent, "ul", options) - this.cleanConditionally(articleContent, "div", options) + this.cleanConditionally(articleContent, "table") + this.cleanConditionally(articleContent, "ul") + this.cleanConditionally(articleContent, "div") + + // replace H1 with H2 as H1 should be only title that is displayed separately + this.replaceNodeTags(articleContent.getElementsByTag("h1"), "h2") // Remove extra paragraphs - removeNodes(articleContent, "p") { paragraph -> - val imgCount = paragraph.getElementsByTag("img").size - val embedCount = paragraph.getElementsByTag("embed").size - val objectCount = paragraph.getElementsByTag("object").size - // At this point, nasty iframes have been removed, only remain embedded video ones. 
- val iframeCount = paragraph.getElementsByTag("iframe").size - val totalCount = imgCount + embedCount + objectCount + iframeCount - - return@removeNodes totalCount == 0 && getInnerText(paragraph, normalizeSpaces = false).length == 0 + removeNodes(articleContent.getElementsByTag("p")) { paragraph -> + // At this point, nasty iframes have been removed; only embedded video + // ones remain. + val contentElementCount = paragraph.getAllNodesWithTag(arrayOf( + "img", + "embed", + "object", + "iframe" + )).size + + return@removeNodes contentElementCount == 0 && getInnerText( + paragraph, + normalizeSpaces = false + ).isEmpty() } articleContent.select("br").forEach { br -> - val next = nextElement(br.nextSibling(), regEx) - if(next != null && next.tagName() == "p") { + val next = nextNode(br.nextSibling()) + if(next != null && next is Element && next.tagName() == "p") { br.remove() } } + // Remove single-cell tables + articleContent.getElementsByTag("table").forEach { table -> + val tbody = if (this.hasSingleTagInsideElement(table, "tbody")) + table.firstElementChild() + else table + if (tbody?.let { this.hasSingleTagInsideElement(it, "tr") } == true) { + val row = tbody.firstElementChild() + if (row?.let { hasSingleTagInsideElement(it, "td") } == true) { + row.firstElementChild()?.let { cell -> + cell.tagName(if (cell.children().all { isPhrasingContent(it) }) "p" else "div") + table.replaceWith(cell) + } + } + } + } + } + + /* convert images and figures that have properties like data-src into images that can be loaded without JS */ + private fun fixLazyImages(root: Element) { + + root.getAllNodesWithTag(arrayOf("img","picture","figure")) + .forEach function@ { elem -> + + // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute. + // So, here we check if the data uri is too short, just might as well remove it. 
+ var attributes = elem.attributes().toList() + if (elem.attr("src").isNotBlank() && regex.isB64Data(elem.attr("src"))) { + // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. + val parts = regex.getB64Matches(elem.attr("src")) + val dataType = parts?.groups?.get(1)?.value + if ( dataType == "image/svg+xml") { + return@function + } + + // Make sure this element has other attributes which contains image. + // If it doesn't, then this src is important and shouldn't be removed. + var srcCouldBeRemoved = false + attributes.forEach { attr-> + if (!srcCouldBeRemoved && (attr.key != "src") && + "\\.(jpg|jpeg|png|webp)" + .toRegex(RegexOption.IGNORE_CASE) + .containsMatchIn(attr.value)) { + srcCouldBeRemoved = true + } + } + + // Here we assume if image is less than 100 bytes (or 133 after encoded to base64) + // it will be too small, therefore it might be placeholder image. + if (srcCouldBeRemoved) { + //if you get there this isn't possible to be null + val b64starts = parts?.groups?.get(0)?.value?.length!! + val b64length = elem.attr("src").length - b64starts + if (b64length < 133) { + elem.removeAttr("src") + } + } + } + + // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 + // but this one only applies to js + if ( + (elem.attr("src").isNotBlank() || elem.attr("srcset").isNotBlank()) && + "lazy" !in elem.className().lowercase() + ) { + return@function + } + attributes=elem.attributes().toList() + attributes.forEach attrs@{ attr-> + if ( + attr.key == "src" || + attr.key == "srcset" || + attr.key == "alt" + ) { + return@attrs + } + var copyTo:String? 
= null + if ( "\\.(jpg|jpeg|png|webp)\\s+\\d".toRegex().containsMatchIn(attr.value)) { + copyTo = "srcset" + } else if ("^\\s*\\S+\\.(jpg|jpeg|png|webp)\\S*\\s*$".toRegex().containsMatchIn(attr.value)) { + copyTo = "src" + } + if (copyTo!=null) { + //if this is an img or picture, set the attribute directly + if (elem.tagName() == "img" || elem.tagName() == "picture") { + elem.attr(copyTo, attr.value) + } else if ( + elem.tagName() == "figure" && + (elem.getAllNodesWithTag(arrayOf("img","picture"))).isEmpty() + ) { + //if the item is a

that does not contain an image or picture, create one and place it inside the figure + //see the nytimes-3 testcase for an example + val img = Element("img") + img.attr(copyTo, attr.value) + elem.appendChild(img) + } + } + } + } } /** * Remove the style attribute on every e and under. * TODO: Test if getElementsByTagName(*) is faster. */ - protected open fun cleanStyles(e: Element) { + private fun cleanStyles(e: Element) { if(e.tagName() == "svg") { return } - if(e.className() !== "readability-styled") { - // Remove `style` and deprecated presentational attributes - PRESENTATIONAL_ATTRIBUTES.forEach { attributeName -> - e.removeAttr(attributeName) - } +// Not in Readability.js +// if(e.className() != "readability-styled") { + // Remove `style` and deprecated presentational attributes + PRESENTATIONAL_ATTRIBUTES.forEach { attributeName -> + e.removeAttr(attributeName) + } - if(DEPRECATED_SIZE_ATTRIBUTE_ELEMS.contains(e.tagName())) { - e.removeAttr("width") - e.removeAttr("height") - } + if(DEPRECATED_SIZE_ATTRIBUTE_ELEMS.contains(e.tagName())) { + e.removeAttr("width") + e.removeAttr("height") } +// } e.children().forEach { child -> cleanStyles(child) } } - protected open fun markDataTables(root: Element) { + /** + * Look for 'data' (as opposed to 'layout') tables, for which we use + * similar checks as + * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19 + */ + private fun markDataTables(root: Element) { root.getElementsByTag("table").forEach outer@ { table -> val role = table.attr("role") if(role == "presentation") { - setReadabilityDataTable(table, false) + table._readabilityDataTable=false return@outer } val datatable = table.attr("datatable") if(datatable == "0") { - setReadabilityDataTable(table, false) + table._readabilityDataTable=false return@outer } val summary = table.attr("summary") if(summary.isNotBlank()) { - setReadabilityDataTable(table, true) + 
table._readabilityDataTable=true return@outer } val caption = table.getElementsByTag("caption") if(caption.size > 0 && caption[0].childNodeSize() > 0) { - setReadabilityDataTable(table, true) + table._readabilityDataTable=true return@outer } - // If the table has a descendant with any of these tags, consider a data table: (move to DATA_TABLE_DESCENDANTS to make code a more readable and a bit faster) - DATA_TABLE_DESCENDANTS.forEach { tag -> + + // If the table has a descendant with any of these tags, consider a data table + val dataTableDescendants = listOf("col", "colgroup", "tfoot", "thead", "th") + dataTableDescendants.forEach { tag -> if(table.getElementsByTag(tag).size > 0) { - log.debug("Data table because found data-y descendant") - setReadabilityDataTable(table, true) + log.info("Data table because found data-y descendant") + table._readabilityDataTable=true return@outer } } // Nested tables indicate a layout table: - if(table.getElementsByTag("table").size > 0) { - setReadabilityDataTable(table, false) + // Js dont look for the same element or that looks like + if(table.getElementsByTag("table").size > 1) { + table._readabilityDataTable= false return@outer } val sizeInfo = getRowAndColumnCount(table) + + if (sizeInfo.second == 1 || sizeInfo.first == 1) { + // single colum/row tables are commonly used for page layout purposes. + table._readabilityDataTable = false + return@outer + } + if (sizeInfo.first >= 10 || sizeInfo.second > 4) { - setReadabilityDataTable(table, true) + table._readabilityDataTable= true return@outer } // Now just go by size entirely: - setReadabilityDataTable(table, sizeInfo.first * sizeInfo.second > 10) + table._readabilityDataTable= (sizeInfo.first * sizeInfo.second > 10) } } /** * Return an object indicating how many rows and columns this table has. 
*/ - protected open fun getRowAndColumnCount(table: Element): Pair { + private fun getRowAndColumnCount(table: Element): Pair { var rows = 0 var columns = 0 - val trs = table.getElementsByTag("tr") - trs.forEach { tr -> + table.getElementsByTag("tr").forEach { row -> rows += - try { - tr.attr("rowspan").toInt() - } catch(ignored: Exception) { - 1 - } + row.attr("rowspan") + .takeIf { "^\\d.".toRegex().matches(it) }?.toInt() ?:1 // Now look for column-related info var columnsInThisRow = 0 - tr.getElementsByTag("td").forEach { cell -> - columnsInThisRow += - try { - cell.attr("colspan").toInt() - } catch(ignored: Exception) { - 1 - } + row.getElementsByTag("td").forEach { cell -> + columnsInThisRow += cell.attr("colspan") + .takeIf { "^\\d.".toRegex().matches(it) }?.toInt() ?:1 } - columns = Math.max(columns, columnsInThisRow) + columns = max(columns, columnsInThisRow) } return Pair(rows, columns) } - protected open fun cleanConditionally(e: Element, tag: String, options: ArticleGrabberOptions) { - if(options.cleanConditionally == false) + private fun cleanConditionally(e: Element, tag: String) { + if(!options.cleanConditionally) return - val isList = tag == "ul" || tag == "ol" - // Gather counts for other typical elements embedded within. // Traverse backwards so we can remove nodes at the same time // without effecting the traversal. // // TODO: Consider taking into account original contentScore here. - removeNodes(e, tag) { node -> - // First check if we're in a data table, in which case don't remove us. - val isDataTable: (Element) -> Boolean = { element -> - getReadabilityDataTable(element) + removeNodes(e.getElementsByTag(tag)) filterFunction@ { node -> + + // First check if this node IS data table, in which case don't remove it. 
+ val isDataTable: (Element) -> Boolean = { it._readabilityDataTable } + + var isList = tag == "ul" || tag == "ol" + + if (!isList) { + var listLength = 0 + val listNodes = node.getAllNodesWithTag(arrayOf("ul", "ol")) + listNodes.forEach{ list -> + listLength += getInnerText(list).length + } + val nodeTextLength = getInnerText(node).length + if (nodeTextLength!=0) + isList = listLength / nodeTextLength > 0.9 } + + if (tag == "table" && isDataTable(node)) { + return@filterFunction false + } + + // Next check if we're inside a data table, in which case don't remove it as well. if(hasAncestorTag(node, "table", -1, isDataTable)) { - return@removeNodes false + return@filterFunction false } - val weight = getClassWeight(node, options) - val contentScore = 0 + if (hasAncestorTag(node, "code")) { + return@filterFunction false + } + + // keep element if it has a data tables + if (node.getElementsByTag("table").any { tbl -> tbl._readabilityDataTable }) { + return@filterFunction false + } - log.debug("Cleaning Conditionally {}", node) + val weight = getClassWeight(node) + + log.info("Cleaning Conditionally {}", node.log()) + + val contentScore = 0 if(weight + contentScore < 0) { - return@removeNodes true + return@filterFunction true } if(getCharCount(node, ',') < 10) { @@ -964,51 +1204,150 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v val img = node.getElementsByTag("img").size val li = node.getElementsByTag("li").size - 100 val input = node.getElementsByTag("input").size + val headingDensity = getTextDensity(node, arrayOf( + "h1", + "h2", + "h3", + "h4", + "h5", + "h6" + )) var embedCount = 0 - node.getElementsByTag("embed").forEach { - if(regEx.isVideo(it.attr("src")) == false) { - embedCount += 1 + node.getAllNodesWithTag(arrayOf("object", "embed", "iframe")).forEach { embed-> + // If this embed has attribute that matches video regex, don't delete it. 
+ if (embed.attributes().any { attr-> + attr.value.let { + regex.hasAllowedVideo(it) + } + }){ + return@filterFunction false + } + // For embed with tag, check inner HTML as well. + if(embed.tagName() == "object" && + regex.hasAllowedVideo(embed.html()) + ){ + return@filterFunction false } + + embedCount += 1 } + val innerText = getInnerText(node) + + // toss any node whose inner text contains nothing but suspicious words + if ( + regex.hasAdWords(innerText) || + regex.hasLoadingWords(innerText) + ) { + return@filterFunction true + } + + val contentLength = innerText.length val linkDensity = getLinkDensity(node) - val contentLength = getInnerText(node, regEx).length - - val haveToRemove = - (img > 1 && p / img.toFloat() < 0.5 && !hasAncestorTag(node, "figure")) || - (!isList && li > p) || - (input > Math.floor(p/3.0)) || - (!isList && contentLength < 25 && img == 0 && !hasAncestorTag(node, "figure")) || - (!isList && weight < 25 && linkDensity > 0.2) || - (weight >= 25 && linkDensity > 0.5) || - ((embedCount == 1 && contentLength < 75) || embedCount > 1) - return@removeNodes haveToRemove + val textishTags = arrayOf("span", "li", "td")+ DIV_TO_P_ELEMS + val textDensity = getTextDensity(node,textishTags) + val isFigureChild = hasAncestorTag(node,"figure") + val shouldRemoveNode: () -> Boolean = { + val errs= arrayListOf() + if (!isFigureChild && img > 1 && p.toDouble() / img.toDouble() < 0.5 ) { + errs.add("Bad p to img ratio (img=${img}, p=${p})") + } + if (!isList && li > p){ + errs.add("Too many li's outside of a list. (li=${li} > p=${p})") + } + if(input > floor(p/3.0)){ + errs.add("Too many inputs per p. (input=${input}, p=${p})") + } + if(!isList && + !isFigureChild && + headingDensity < 0.9 && + contentLength < 25 && + (img == 0 || img>2) && + linkDensity > 0) { + errs.add("Suspiciously short. 
(headingDensity=${headingDensity}, img=${img}, linkDensity=${linkDensity})") + } + if(!isList && weight < 25 && linkDensity > 0.2){ + errs.add("Low weight and a little linky. (linkDensity=${linkDensity})") + } + if(weight >= 25 && linkDensity > 0.5){ + errs.add("High weight and mostly links. (linkDensity=${linkDensity})") + } + if((embedCount == 1 && contentLength < 75) || embedCount > 1){ + errs.add("Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})") + } + if(img == 0 && textDensity == 0.0){ + errs.add("No useful content. (img=${img}, textDensity=${textDensity})") + } + + if (errs.size>0){ + log.info("Checks failed {}",errs.joinToString(", ","["," ]")) + } + + errs.size!=0 + } + + val haveToRemove = shouldRemoveNode() + + if (isList && haveToRemove){ + node.children().forEach { child-> + // Don't filter in lists with li's that contain more than one child + if (child.children().size > 1) { + @Suppress("KotlinConstantConditions") + // just for make it "exact" as js code for the reader + return@filterFunction haveToRemove + } + } + val liCount = node.getElementsByTag("li").size + + // Only allow the list to remain if every li contains an image + if (img == liCount) { + return@filterFunction false + } + } + return@filterFunction haveToRemove } + return@filterFunction false + } + } - return@removeNodes false + private fun getTextDensity(e: Element, tags: Array): Double { + val textLength = getInnerText(e,true).length + if (textLength == 0) { + return 0.0 + } + var childrenLength = 0.0 + val children = e.getAllNodesWithTag(tags).filterNot { it == e } + children.forEach{ + child -> + childrenLength += this.getInnerText(child, true).length } + return childrenLength / textLength } /** * Check if a given node has one of its ancestor tag name matching the * provided one. */ - protected open fun hasAncestorTag(node: Element, tagName: String, maxDepth: Int = 3, filterFn: ((Element) -> Boolean)? 
= null): Boolean { - val tagNameLowerCase = tagName.toLowerCase() + private fun hasAncestorTag(node: Element, tagName: String, maxDepth: Int = 3, + filterFn: ((Element) -> Boolean) = { + true //bc you don't want a null exception + }): Boolean { var parent = node var depth = 0 while(parent.parent() != null) { - if(maxDepth > 0 && depth > maxDepth) { + if(maxDepth in 1.. + removeNodes(e.getElementsByTag(tag)) filterFunction@ { element -> // Allow youtube and vimeo videos through as people usually want to see those. if(isEmbed) { - val attributeValues = element.attributes().map { it.value }.joinToString("|") + val attributeValues = element.attributes().joinToString("|") { it.value } // First, check the elements attributes to see if any of them contain youtube or vimeo - if(regEx.isVideo(attributeValues)) { - return@removeNodes false + if(regex.hasAllowedVideo(attributeValues)) { + return@filterFunction false } - // Then check the elements inside this element for the same. - if(regEx.isVideo(element.html())) { - return@removeNodes false + // For embed with tag, check inner HTML as well. + if(element.tagName() == "object" && regex.hasAllowedVideo(element.html())) { + return@filterFunction false } } - return@removeNodes true + return@filterFunction true } } /** * Clean out elements whose id/class combinations match specific string. 
*/ - protected open fun cleanMatchedNodes(e: Element, regex: Regex) { + private fun cleanMatchedNodes(e: Element,filterFn: (Element,String) -> Boolean) { val endOfSearchMarkerNode = getNextNode(e, true) var next = getNextNode(e) while(next != null && next != endOfSearchMarkerNode) { - if(regex.containsMatchIn(next.className() + " " + next.id())) { - next = removeAndGetNext(next, regex.pattern) - } - else { - next = getNextNode(next) + next = if(filterFn(next,next.className() + " " + next.id())) { + removeAndGetNext(next) + } else { + getNextNode(next) } } } @@ -1069,57 +1411,23 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v /** * Clean out spurious headers from an Element. Checks things like classnames and link density. */ - protected open fun cleanHeaders(e: Element, options: ArticleGrabberOptions) { - Arrays.asList("h1", "h2").forEach { - removeNodes(e, it) { header -> - getClassWeight(header, options) < 0 + private fun cleanHeaders(e: Element) { + removeNodes(e.getAllNodesWithTag(arrayOf("h1", "h2"))) { node -> + (getClassWeight(node) < 0).also { if (it) + log.info("Removing header with low class weight: {}", node.log() ) } } } - /* Util methods */ - - protected open fun removeAndGetNext(node: Element, reason: String = ""): Element? { - val nextNode = this.getNextNode(node, true) - printAndRemove(node, reason) - return nextNode - } - - /** - * Traverse the DOM from node to node, starting at the node passed in. - * Pass true for the second parameter to indicate this node itself - * (and its kids) are going away, and we want the next node over. - * - * Calling this in a loop will traverse the DOM depth-first. - */ - protected open fun getNextNode(node: Element, ignoreSelfAndKids: Boolean = false): Element? { - // First check for kids if those aren't being ignored - if(!ignoreSelfAndKids && node.children().size > 0) { - return node.child(0) - } - - // Then for siblings... 
- node.nextElementSibling()?.let { return it } - - // And finally, move up the parent chain *and* find a sibling - // (because this is depth-first traversal, we will have already - // seen the parent nodes themselves). - var parent = node.parent() - while(parent != null && parent.nextElementSibling() == null) { - parent = parent.parent() - } - - return parent?.nextElementSibling() - } - - protected open fun getTextDirection(topCandidate: Element, doc: Document) { + private fun getTextDirection(topCandidate: Element, doc: Document) { val ancestors = mutableSetOf(topCandidate.parent(), topCandidate) - ancestors.addAll(getNodeAncestors(topCandidate.parent())) + ancestors.addAll(topCandidate.parent()?.let { getNodeAncestors(it) }?: listOf()) ancestors.add(doc.body()) ancestors.add(doc.selectFirst("html")) // needed as dir is often set on html tag ancestors.filterNotNull().forEach { ancestor -> + val articleDir = ancestor.attr("dir") if (articleDir.isNotBlank()) { this.articleDir = articleDir @@ -1129,16 +1437,25 @@ open class ArticleGrabber(protected val options: ReadabilityOptions, protected v } - protected open fun getReadabilityObject(element: Element): ReadabilityObject? { - return readabilityObjects[element] - } + private var Element.readability : ReadabilityObject? 
+ set(value) { + if (value!=null) + readabilityObjects[this]=value + } + get(){ + return readabilityObjects[this] + } + + private var Element._readabilityDataTable : Boolean + set(value){ + readabilityDataTable[this]=value + } + get(){ + return readabilityDataTable[this] ?: false + } + - protected open fun getReadabilityDataTable(table: Element): Boolean { - return this.readabilityDataTable[table] ?: false - } - protected open fun setReadabilityDataTable(table: Element, readabilityDataTable: Boolean) { - this.readabilityDataTable.put(table, readabilityDataTable) - } } + diff --git a/src/main/kotlin/net/dankito/readability4j/processor/MetadataParser.kt b/src/main/kotlin/net/dankito/readability4j/processor/MetadataParser.kt index a84e782..8bd24ed 100644 --- a/src/main/kotlin/net/dankito/readability4j/processor/MetadataParser.kt +++ b/src/main/kotlin/net/dankito/readability4j/processor/MetadataParser.kt @@ -1,106 +1,213 @@ package net.dankito.readability4j.processor +import com.fasterxml.jackson.databind.JsonNode +import com.fasterxml.jackson.databind.ObjectMapper import net.dankito.readability4j.model.ArticleMetadata -import net.dankito.readability4j.util.RegExUtil +import net.dankito.readability4j.util.BaseRegexUtil import org.jsoup.nodes.Document -import java.util.regex.Pattern +import org.slf4j.LoggerFactory +import java.util.Deque +import java.util.Queue -open class MetadataParser(protected val regEx: RegExUtil = RegExUtil()): ProcessorBase() { +open class MetadataParser(override val regex: BaseRegexUtil = BaseRegexUtil()): ProcessorBase() { + private val log = LoggerFactory.getLogger(MetadataParser::class.java) - open fun getArticleMetadata(document: Document): ArticleMetadata { + /** + * Attempts to get excerpt and byline metadata for the article. + * + * @param document — the Document + * @param jsonld — object containing any metadata that + * could be extracted from JSON-LD object. 
+ * + * @return ArticleMetadata with optional "excerpt" and "byline" properties + */ + open fun getArticleMetadata(document: Document,jsonld:ArticleMetadata?): ArticleMetadata { val metadata = ArticleMetadata() val values = HashMap() + val metaElements = document.getElementsByTag("meta"); - // Match "description", or Twitter's "twitter:description" (Cards) - // in name attribute. - val namePattern = Pattern.compile("^\\s*((twitter)\\s*:\\s*)?(description|title)\\s*$", Pattern.CASE_INSENSITIVE) + // property is a space-separated list of values + val propertyPattern = Regex("\\s*(article|dc|dcterm|og|twitter)\\s*:\\s*(author|creator|description|published_time|title|site_name)\\s*", RegexOption.IGNORE_CASE) - // Match Facebook's Open Graph title & description properties. - val propertyPattern = Pattern.compile("^\\s*og\\s*:\\s*(description|title)\\s*$", Pattern.CASE_INSENSITIVE) + // name is a single value + val namePattern = Regex("^\\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\\s*[-.:]\\s*)?(author|creator|pub-date|description|title|site_name)\\s*$", RegexOption.IGNORE_CASE) - document.select("meta").forEach { element -> + metaElements.forEach { element -> val elementName = element.attr("name") val elementProperty = element.attr("property") + val content = element.attr("content") - if(elementName == "author" || elementProperty == "author") { - metadata.byline = element.attr("content") + if (content.isBlank()){ return@forEach } var name: String? = null - if(namePattern.matcher(elementName).find()) { - name = elementName - } - else if(propertyPattern.matcher(elementProperty).find()) { - name = elementProperty - } - - if(name != null) { - val content = element.attr("content") - if(content.isNullOrBlank() == false) { + var matches:MatchResult?=null + if(elementProperty.isNotBlank()) { + matches=propertyPattern.find(elementProperty) + if (matches!=null) { // Convert to lowercase and remove any whitespace // so we can match below. 
- name = name.toLowerCase().replace("\\s".toRegex(), "") - values[name] = content.trim().replace(" ", " ") + name = matches.groupValues[0].lowercase() + .replace("\\s".toRegex(), "") + // multiple authors + values[name] = content.trim() } } + if(matches==null && elementName.isNotEmpty() && namePattern.matches(elementName)) { + // Convert to lowercase and remove any whitespace + // so we can match below. + name = elementName.lowercase() + .replace("\\s".toRegex(), "") + .replace('.',':') + values[name] = content.trim() + } } - metadata.excerpt = values["description"] ?: - values["og:description"] ?: // Use facebook open graph description. - values["twitter:description"] // Use twitter cards description. + // get title + metadata.title = jsonld?.title ?: + values["dc:title"] ?: + values["dcterm:title"] ?: + values["og:title"] ?: + values["weibo:article:title"] ?: + values["weibo:webpage:title"] ?: + values["title"] ?: + values["twitter:title"] ?: + values["parsely-title"] - metadata.title = getArticleTitle(document) - if(metadata.title.isNullOrBlank()) { - metadata.title = values["og:title"] ?: // Use facebook open graph title. - values["twitter:title"] // Use twitter cards title. 
- ?: "" + if (metadata.title==null) { + metadata.title = this.getArticleTitle(document) } - metadata.charset = document.charset()?.name() + val articleAuthor = if (values["article:author"]!=null && + !this.isUrl(values["article:author"])) values["article:author"] else null + + // get author + metadata.byline = jsonld?.byline ?: + values["dc:creator"] ?: + values["dcterm:creator"] ?: + values["author"] ?: + values["parsely-author"] ?: + articleAuthor + + // get description + metadata.excerpt = jsonld?.excerpt ?: + values["dc:description"] ?: + values["dcterm:description"] ?: + values["og:description"] ?: + values["weibo:article:description"] ?: + values["weibo:webpage:description"] ?: + values["description"] ?: + values["twitter:description"] + + // get site name + metadata.siteName = jsonld?.siteName ?: values["og:site_name"] + + // get article published time + metadata.publishedTime = jsonld?.datePublished ?: + values["article:published_time"] ?: + values["parsely-pub-date"] + + //not anymore + //metadata.charset = document.charset().name() + metadata.title = unescapeHtmlEntities(metadata.title) + metadata.byline = unescapeHtmlEntities(metadata.byline) + metadata.excerpt = unescapeHtmlEntities(metadata.excerpt) + metadata.siteName = unescapeHtmlEntities(metadata.siteName) + metadata.publishedTime = unescapeHtmlEntities(metadata.publishedTime) return metadata } - protected open fun getArticleTitle(doc: Document): String { + private fun unescapeHtmlEntities(str:String?):String? 
{ + if (str==null) { + return null + } + + val htmlEscapeMap = mapOf( + "lt" to "<", + "gt" to ">", + "amp" to "&", + "quot" to "\"", + "apos" to "'", + ) + var unescaped = Regex("&(quot|amp|apos|lt|gt);").replace(str) { result -> + val tag = result.groupValues[1] + htmlEscapeMap[tag] ?: result.value + } + unescaped=Regex("&#(?:x([0-9a-f]+)|([0-9]+));",RegexOption.IGNORE_CASE) + .replace(unescaped) { result-> + val hex= result.groups[1]?.value + val numStr = result.groups[2]?.value + if (hex!=null||numStr!=null) { + + var num = hex?.toBigInteger(16)?.toInt() ?: numStr!!.toInt(10) + // these character references are replaced by a conforming HTML parser + if (num == 0 || + (num > 0x10ffff||num <0||(hex!=null&&hex.length>6)) || //Java max int limit + (num in 0xd800..0xdfff)) { + num = 0xfffd + } + + return@replace String(intArrayOf(num),0,1) + } + "\uD83D\uDE2D \uD83D\uDE2D � �" + "&#xg; &#x1F62D; &#128557; &#xFFFFFFFF; &#x0;" + result.value + } + return unescaped + + } + + /** + * Get the article title as an H1. + * + * @return string + **/ + private fun getArticleTitle(doc: Document): String { var curTitle = "" var origTitle = "" try { - origTitle = doc.title() + origTitle = doc.title().trim() curTitle = origTitle // If they had an element with id "title" in their HTML if(curTitle.isBlank()) { - doc.select("#title").first()?.let { elementWithIdTitle -> - origTitle = getInnerText(elementWithIdTitle, regEx) + doc.select("title").first()?.let { elementWithIdTitle -> + origTitle = getInnerText(elementWithIdTitle) curTitle = origTitle } } } catch(e: Exception) {/* ignore exceptions setting the title. 
*/} var titleHadHierarchicalSeparators = false + val wordCount:( String)->Int= { str-> + str.split("\\s+".toRegex()).size + } // If there's a separator in the title, first remove the final part - if(curTitle.contains(" [\\|\\-\\/>»] ".toRegex())) { - titleHadHierarchicalSeparators = curTitle.contains(" [\\/>»] ".toRegex()) - curTitle = origTitle.replace("(.*)[\\|\\-\\/>»] .*".toRegex(RegexOption.IGNORE_CASE), "$1") + val titleSeparators = "|\\-–—\\\\\\/>»"; + if(curTitle.contains("\\s[${titleSeparators}]\\s".toRegex())) { + titleHadHierarchicalSeparators = curTitle.contains("\\s[\\\\/>»]\\s".toRegex()) + val allSeparators = ("\\s[${titleSeparators}]\\s".toRegex(setOf(RegexOption.IGNORE_CASE))).findAll(origTitle) + curTitle = origTitle.substring(0, allSeparators.last().range.last-1 ) // If the resulting title is too short (3 words or fewer), remove // the first part instead: if(wordCount(curTitle) < 3) { - curTitle = origTitle.replace("[^\\|\\-\\/>»]*[\\|\\-\\/>»](.*)".toRegex(RegexOption.IGNORE_CASE), "$1") + curTitle = origTitle.replace("^[^${titleSeparators}]*[|${titleSeparators}]".toRegex(RegexOption.IGNORE_CASE), "") } } else if(curTitle.contains(": ")) { // Check if we have an heading containing this exact string, so we // could assume it's the full title. - val match = doc.select("h1, h2").filter { it.wholeText() == curTitle }.size > 0 + val match = doc.getAllNodesWithTag(arrayOf("h1","h2")).any { it.wholeText().trim() == curTitle.trim() } // If we don't, let's extract the title out of the original title string. 
- if(match == false) { + if(!match) { curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1) // If the title is now too short, try the first colon instead: @@ -118,11 +225,11 @@ open class MetadataParser(protected val regEx: RegExUtil = RegExUtil()): Process val hOnes = doc.getElementsByTag("h1") if(hOnes.size == 1) { - curTitle = getInnerText(hOnes[0], regEx) + curTitle = getInnerText(hOnes[0]) } } - curTitle = curTitle.trim() + curTitle = regex.normalize(curTitle.trim()) // If we now have 4 words or fewer as our title, and either no // 'hierarchical' separators (\, /, > or ») were found in the original // title or we decreased the number of words by more than 1 word, use @@ -130,15 +237,149 @@ open class MetadataParser(protected val regEx: RegExUtil = RegExUtil()): Process val curTitleWordCount = wordCount(curTitle) if(curTitleWordCount <= 4 && (!titleHadHierarchicalSeparators || - curTitleWordCount != wordCount(origTitle.replace("[\\|\\-\\/>»]+".toRegex(), "")) - 1)) { + curTitleWordCount != wordCount(origTitle.replace("\\s[${titleSeparators}]\\s".toRegex(), "")) - 1)) { curTitle = origTitle } return curTitle } - protected open fun wordCount(str: String): Int { - return str.split("\\s+".toRegex()).size + /** + * Try to extract metadata from JSON-LD object. + * For now, only Schema.org objects of type Article or its subtypes are supported. 
+ * @return Object with any metadata that could be extracted (possibly none) + */ + open fun getJSONLD(doc: Document):ArticleMetadata?{ + var metadata:ArticleMetadata?=null + + doc.getElementsByTag("script").forEach{ jsonLdElement-> + if ( + metadata==null && + jsonLdElement.attr("type") == "application/ld+json" + ) { + + try { + // Strip CDATA markers if present + val content = jsonLdElement.html().replace( + Regex("^\\s*\\s*$"), + "" + ) + + var parsed = ObjectMapper().readTree(content) ?: return@forEach + + if (parsed.isArray) { + parsed = parsed.jsFind{ node -> + node.has("@type") && + regex.isJsonLDArticle(node.get("@type").textValue()) + }?:return@forEach + } + + val schemaDotOrgRegex = Regex("^https?://schema\\.org/?$") + val matches = + (parsed["@context"]?.isTextual?.let { + if (it) + schemaDotOrgRegex.containsMatchIn(parsed["@context"].textValue()) + else false + }==true) || + (parsed.get("@context")?.isObject?.let { + if (it) parsed["@context"]["@vocab"]?.isTextual?.let { + schemaDotOrgRegex.containsMatchIn(parsed["@context"]["@vocab"].textValue()) + }==true // we dont want a null here + else false + }==true) + + if (!matches) { + return@forEach + } + + if (parsed["@type"]==null && parsed["@graph"]?.let{parsed["@graph"].isArray}==true) { + parsed = parsed["@graph"].jsFind{ node -> + node.has("@type") && + regex.isJsonLDArticle(node.get("@type").textValue()) + }?: return@forEach + } + + if ( + parsed["@type"]==null || + !regex.isJsonLDArticle(parsed["@type"].textValue()) + ) { + return@forEach + } + + val nonNullMetadata = ArticleMetadata() + + if ( + parsed["name"]?.isTextual == true && + parsed["headline"]?.isTextual == true && + parsed["name"] != parsed["headline"] + ) { + // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz + // put their own name into "name" and the article title to "headline" which confuses Readability. 
So we try to check if either + // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. + + val title = getArticleTitle(doc) + val nameMatches = textSimilarity(parsed["name"].textValue(), title) > 0.75 + val headlineMatches = textSimilarity(parsed["headline"].textValue(), title) > 0.75 + + if (headlineMatches && !nameMatches) { + nonNullMetadata.title = parsed["headline"].textValue() + } else { + nonNullMetadata.title = parsed["name"].textValue() + } + } else if (parsed["name"]?.isTextual == true) { + nonNullMetadata.title = parsed["name"].textValue().trim() + } else if (parsed["headline"]?.isTextual == true) { + nonNullMetadata.title = parsed["headline"].textValue().trim() + } + if (parsed.hasNonNull("author") ) { + if (parsed["author"]?.get("name")?.isTextual == true) { + val name = parsed["author"].get("name").textValue() + if(name.isNotBlank()) + nonNullMetadata.byline = name.trim() + } else if ( + parsed["author"].isArray && + parsed["author"].get(0)?.get("name")?.isTextual == true + ) { + nonNullMetadata.byline = parsed["author"].filter { author -> + author?.get("name")?.isTextual == true + }.joinToString(", ") { author -> + author["name"].textValue().trim() + } + } + } + if (parsed["description"]?.isTextual == true) { + nonNullMetadata.excerpt = parsed["description"].textValue().trim() + } + if (parsed["publisher"]?.isObject==true && + parsed["publisher"]?.get("name")?.isTextual == true) { + nonNullMetadata.siteName = parsed["publisher"]["name"].textValue().trim() + } + if (parsed["datePublished"]?.isTextual == true) { + nonNullMetadata.datePublished = parsed["datePublished"].textValue().trim() + } + metadata=nonNullMetadata + } catch (err:Exception) { + log.error("{}\n\n{}",err.message,err.stackTraceToString()) + println(err.message+"\n\n"+err.stackTraceToString()) + } + } + } + + return metadata } -} \ No newline at end of file + /** + * Array.Prototype.find (this should work like that one) + * 
+ * @return null if don't find in the array + * */ + private fun JsonNode.jsFind(filterFun:(JsonNode)->Boolean):JsonNode?{ + if (this.isArray){ + for (value in this){ + if (filterFun(value)) + return value + } + } + return null + } +} diff --git a/src/main/kotlin/net/dankito/readability4j/processor/Postprocessor.kt b/src/main/kotlin/net/dankito/readability4j/processor/Postprocessor.kt index accf80d..fe1f992 100644 --- a/src/main/kotlin/net/dankito/readability4j/processor/Postprocessor.kt +++ b/src/main/kotlin/net/dankito/readability4j/processor/Postprocessor.kt @@ -1,142 +1,215 @@ package net.dankito.readability4j.processor -import org.jsoup.nodes.Document +import net.dankito.readability4j.model.ReadabilityOptions +import net.dankito.readability4j.util.BaseRegexUtil import org.jsoup.nodes.Element import org.jsoup.nodes.TextNode import org.slf4j.LoggerFactory import java.net.URI -import java.util.Arrays -import java.util.regex.Pattern +import java.net.URISyntaxException -open class Postprocessor { - - companion object { - val AbsoluteUriPattern = Pattern.compile("^[a-zA-Z][a-zA-Z0-9\\+\\-\\.]*:") - +open class Postprocessor(override val regex:BaseRegexUtil= BaseRegexUtil()):ProcessorBase() { + companion object { // These are the classes that readability sets itself. - val CLASSES_TO_PRESERVE = Arrays.asList("readability-styled", "page") - + val CLASSES_TO_PRESERVE = listOf("page") private val log = LoggerFactory.getLogger(Postprocessor::class.java) } - - open fun postProcessContent(originalDocument: Document, articleContent: Element, articleUri: String, - additionalClassesToPreserve: Collection = emptyList()) { + open fun postProcessContent(articleContent: Element,baseUri: String, documentUri: String, + options: ReadabilityOptions) { + // Readability cannot open relative uris so we convert them to absolute uris. 
- fixRelativeUris(originalDocument, articleContent, articleUri) + fixRelativeUris(articleContent, baseUri, documentUri) + simplifyNestedElements(articleContent) - // Remove IDs and classes. - // Remove classes. - val classesToPreserve = Arrays.asList(CLASSES_TO_PRESERVE, additionalClassesToPreserve).flatten().toSet() - cleanClasses(articleContent, classesToPreserve) + if (!options.keepClasses) { + // Remove classes. + this.cleanClasses(articleContent,options.additionalClassesToPreserve) + } } + private fun simplifyNestedElements(articleContent: Element) { + var node:Element?= articleContent + + while (node!=null){ + if ( + node.parentNode()!=null && + node.tagName() in arrayOf("div", "section") && + !(node.id().isNotBlank() && node.id().startsWith("readability")) + ) { + if (this.isElementWithoutContent(node)) { + node = this.removeAndGetNext(node) + continue + } else if ( + this.hasSingleTagInsideElement(node, "div") || + this.hasSingleTagInsideElement(node, "section") + ) { + val child = node.children()[0] + for (i in node.attributes()){ + child.attr(i.key,i.value) + } + node.replaceWith(child) + node = child + continue + } + } - /** - * Converts each and uri in the given element to an absolute URI, - * ignoring #ref URIs. 
- */ - protected open fun fixRelativeUris(originalDocument: Document, element: Element, articleUri: String) { - try { - val uri = URI.create(articleUri) - val scheme = uri.scheme - val prePath = uri.scheme + "://" + uri.host - val pathBase = uri.scheme + "://" + uri.host + uri.path.substring(0, uri.path.lastIndexOf("/") + 1) - - fixRelativeUris(originalDocument, element, scheme, prePath, pathBase) - } catch(e: Exception) { log.error("Could not fix relative urls for $element with base uri $articleUri", e) } + node = getNextNode(node) + } } - protected open fun fixRelativeUris(originalDocument: Document, element: Element, scheme: String, prePath: String, - pathBase: String) { - fixRelativeAnchorUris(element, scheme, prePath, pathBase) - - fixRelativeImageUris(element, scheme, prePath, pathBase) + /** + * Converts each and uri in the given element to an absolute uri, + * ignoring #ref uris. + */ + open fun fixRelativeUris(element: Element, baseUri: String, documentUri: String) { + try { + var realBaseUri=baseUri + //this because if no base tag the same behavior is in base javascript + if (baseUri.isBlank()&&documentUri.isBlank()) + return //nothing to do if no documentUri neither base tag + else if (baseUri.isBlank()) { + realBaseUri = documentUri + } + if (URI(realBaseUri).isAbsolute){ + fixRelativeAnchorUris(element, realBaseUri,documentUri) + fixRelativeImageUris(element, realBaseUri,documentUri) + } + }catch (e: URISyntaxException ){ + //this one is just the java variant of the error just in case + log.error("Could not fix relative uri for element:$element with base uri documentUri:$documentUri because it don't look a valid uri", e) + } catch(e: Exception) { + log.error("Could not fix relative uri for $element with base uri $documentUri", e) + } } - protected open fun fixRelativeAnchorUris(element: Element, scheme: String, prePath: String, pathBase: String) { + protected open fun fixRelativeAnchorUris(element: Element, baseURI:String, documentURI:String) { 
element.getElementsByTag("a").forEach { link -> - val href = link.attr("href") + val href = link.attr("href").trim() if(href.isNotBlank()) { // Replace links with javascript: URIs with text content, since // they won't work after scripts have been removed from the page. if(href.indexOf("javascript:") == 0) { - val text = TextNode(link.wholeText()) - link.replaceWith(text) + if ( + link.childNodes().size == 1 && + link.childNodes()[0] is TextNode + ) { + val text = TextNode(link.wholeText()) + link.replaceWith(text) + }else{ + // if the link has multiple children, they should all be preserved + val container = Element("span") + while (link.firstChild()!=null) { + link.firstChild()?.let { container.appendChild(it) } + } + link.replaceWith(container) + } } else { - link.attr("href", toAbsoluteURI(href, scheme, prePath, pathBase)) + link.attr("href", toAbsoluteURI(href, baseURI,documentURI)) } } } } - protected open fun fixRelativeImageUris(element: Element, scheme: String, prePath: String, pathBase: String) { - element.getElementsByTag("img").forEach { img -> - fixRelativeImageUri(img, scheme, prePath, pathBase) - } - } + protected open fun fixRelativeImageUris(element: Element, baseUri:String, docUri: String) { + val medias = element.getAllNodesWithTag(arrayOf( + "img", + "picture", + "figure", + "video", + "audio", + "source", + )) + + medias.forEach { media -> + val src = media.attr("src").trim() + val poster = media.attr("poster").trim() + val srcset = media.attr("srcset").trim() + + if(src.isNotBlank()) { + media.attr("src", toAbsoluteURI(src,baseUri,docUri)) + } - protected open fun fixRelativeImageUri(img: Element, scheme: String, prePath: String, pathBase: String) { - val src = img.attr("src") + if (poster.isNotBlank()) { + media.attr("poster", toAbsoluteURI(poster,baseUri,docUri)) + } + if (srcset.isNotBlank()) { + var newSrcset = "" + regex.getSrcSetMatches(srcset).map { it.groups }.forEach { group -> + val srcSetBaseUri=group[1]?.value + val 
srcSetSize=group[2]?.value + val srcSetSeparator=group[3]?.value + if (srcSetBaseUri!=null&&srcSetSeparator!=null){ + newSrcset+= toAbsoluteURI(srcSetBaseUri,baseUri,docUri)+(srcSetSize?:"")+srcSetSeparator + } + } - if(src.isNotBlank()) { - img.attr("src", toAbsoluteURI(src, scheme, prePath, pathBase)) + media.attr("srcset", newSrcset) + } } } - protected open fun toAbsoluteURI(uri: String, scheme: String, prePath: String, pathBase: String): String { - // If this is already an absolute URI, return it. - if(isAbsoluteUri(uri) || uri.length <= 2) { - return uri - } - - // Scheme-rooted relative URI. - if(uri.substring(0, 2) == "//") { - return scheme + "://" + uri.substring(2) - } + protected open fun toAbsoluteURI(uri: String, baseURI:String, documentURI:String): String { - // Prepath-rooted relative URI. - if(uri[0] == '/') { - return prePath + uri - } - - // Dotslash relative URI. - if(uri.indexOf("./") == 0) { - return pathBase + uri.substring(2) + // Leave hash links alone if the base URI matches the document URI: + if(baseURI==documentURI && uri[0] == '#') { + return uri } - // Ignore hash URIs: - if(uri[0] == '#') { - return uri + // Otherwise, resolve against base URI: + try { + //Zero width space breaks the Java URI match algorithm and and Redability.js just don't mind it + //in really really weird cases it can be in the url + //at least the 95% of code here is because javascript and java resolves god know how the uris but different + if(uri.startsWith("\u200B")||uri.startsWith("%E2%80%8B")){ + return URI(baseURI).resolve("").toString()+uri + }else if(uri.contains(Regex("^\\.\\./\\.\\./\\.\\./(\\.\\./)+"))){ + return URI(baseURI).resolve(uri.replace(Regex("^(\\.\\./)+"),"../../")).toString() + }else if(uri.contains(Regex("^file:"))){ + return uri.replaceFirst("file:///","file:/") //dont resolve file uris + }else { + val realUri = URI(baseURI).resolve(uri).toString() + + //also add to the latest part the / just for testing proposes as URI doesn't add it + 
val isOnlyFirstPart = Regex("^(?:http|https)://[a-zA-Z.0-9-_]+(?::\\d+)?(/)?\$") + return if (isOnlyFirstPart.matches(uri) && uri.last() != '/') { + "$realUri/" + } else { + val firstPartNeedsSlash = Regex("^(?:http|https)://[a-zA-Z.0-9-_]+(?::\\d+)?(/)?") + if(firstPartNeedsSlash.find(realUri)?.groups?.let { it[1] ==null }==true) + firstPartNeedsSlash.replace(realUri,"$0/") + else + realUri + } + } + } catch (_: Exception) { + // Something went wrong, just return the original: } - // Standard relative URI add entire path. pathBase already includes a - // trailing "/". - return pathBase + uri + return uri } - - protected open fun isAbsoluteUri(uri: String): Boolean { - return AbsoluteUriPattern.matcher(uri).find() - } - /** * Removes the class="" attribute from every element in the given * subtree, except those that match CLASSES_TO_PRESERVE and * the classesToPreserve array from the options object. + * + * @param node Element to clean it and their children + * @param classesToPreserve Set of Strings of ReadabilityOptions.classesToPreserve + * @return void */ protected open fun cleanClasses(node: Element, classesToPreserve: Set) { - val classNames = node.classNames().filter { classesToPreserve.contains(it) } + val classNames = node.classNames().filter { it in (classesToPreserve + CLASSES_TO_PRESERVE) } if(classNames.isNotEmpty()) { node.classNames(classNames.toMutableSet()) } else { - node.removeAttr("class") + node.classNames(setOf()) } node.children().forEach { child -> @@ -144,4 +217,4 @@ open class Postprocessor { } } -} \ No newline at end of file +} diff --git a/src/main/kotlin/net/dankito/readability4j/processor/Preprocessor.kt b/src/main/kotlin/net/dankito/readability4j/processor/Preprocessor.kt index 660a2d5..7507c0a 100644 --- a/src/main/kotlin/net/dankito/readability4j/processor/Preprocessor.kt +++ b/src/main/kotlin/net/dankito/readability4j/processor/Preprocessor.kt @@ -1,6 +1,6 @@ package net.dankito.readability4j.processor -import 
net.dankito.readability4j.util.RegExUtil +import net.dankito.readability4j.util.BaseRegexUtil import org.jsoup.nodes.Document import org.jsoup.nodes.Element import org.jsoup.nodes.Node @@ -9,11 +9,10 @@ import org.slf4j.LoggerFactory /** * Performs basic sanitization before starting the extraction process. */ -open class Preprocessor(protected val regEx: RegExUtil = RegExUtil()) : ProcessorBase() { +open class Preprocessor(override val regex: BaseRegexUtil = BaseRegexUtil()) : ProcessorBase() { + + private val log = LoggerFactory.getLogger(Preprocessor::class.java) - companion object { - private val log = LoggerFactory.getLogger(Preprocessor::class.java) - } /** @@ -24,40 +23,134 @@ open class Preprocessor(protected val regEx: RegExUtil = RegExUtil()) : Processo log.debug("Starting to prepare document") removeScripts(document) - removeNoscripts(document); - removeStyles(document) + removeNodes(document.getElementsByTag("style")) - removeForms(document) // TODO: this is not in Mozilla's Readability +// removeForms(document) // TODO: this was moved in Mozilla's Readability to on grabArticle - removeComments(document) // TODO: this is not in Mozilla's Readability +// removeComments(document) // TODO: this is not in Mozilla's Readability now - replaceBrs(document, regEx) + replaceBrs(document) - replaceNodes(document, "font", "span") + replaceNodeTags(document.getElementsByTag("font"), "span") } + /** + * Removes script tags from the document. + * + * @param document + **/ + private fun removeScripts(document: Document) { + removeNodes(document.getAllNodesWithTag(arrayOf("script","noscript"))) + } - protected open fun removeScripts(document: Document) { - removeNodes(document, "script") { scriptNode -> - scriptNode.`val`(null) // TODO: what is this good for? - scriptNode.removeAttr("src") - true + /** + * Check if node is image, or if node contains exactly only one image + * whether as a direct child or as its descendants. 
+ * + * @param noscript Element + **/ + private fun isSingleImage(noscript: Element): Boolean { + var element:Element?=noscript + while (element!=null){ + if (element.tagName() == "img") { + return true + } + if (element.children().size != 1 || element.wholeText().trim() != ""){ + return false + } + element = element.child(0) } + return false } - protected open fun removeNoscripts(document: Document) { - document.getElementsByTag("noscript").forEach { noscript -> - if(shouldKeepImageInNoscriptElement(document, noscript)) { // TODO: this is not in Mozilla's Readability - noscript.unwrap() + /** + * Find all