diff --git a/.gitignore b/.gitignore index 615c5c5c9..86120d4b6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,42 @@ +.idea +.idea/ +.idea_modules/ target/ /goose.iml /goose.ipr /goose.iws +config/development.scala +config/production.scala +config/test.scala +target/ +log/ +perf/run_ab +dist/ +project/boot/ +project/plugins/project/ +project/plugins/src_managed/ +*.log +*.tmproj +lib_managed/ +*.swp +*.iml +*~ +*# +.#* +.idea +.DS_Store +pmip/ +.history +.cache +.classpath +.project +.settings/ +Capfile.* +geoip-db/ +config/consumer.properties +release/ +*.sw* +.DS_Store +bin/ +/.cache-main +/.cache-tests diff --git a/README.md b/README.md index 47af399be..58ceee844 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,22 @@ #Goose - Article Extractor -##Intro +[Goose](https://github.com/GravityLabs/goose) fork published on Maven Central. + +## This is a fork + +If you haven't guessed already, this is a fork of the wonderful [Goose library](http://github.com/GravityLabs/goose) by Gravity Labs. The original repo hasn't been updated for 2 years now, and there have been quite [a few nice pull requests](https://github.com/GravityLabs/goose/pulls) that are lying dormant. +The project now uses sbt, and is hosted on Sonatype. Add the following to your `build.sbt` to pull it in: -Goose was originally an article extractor written in Java that has most recently (aug2011) converted to a scala project. It's mission is to take any news article or article type web page and not only extract what is the main body of the article but also all meta data and most probable image candidate. +``` +libraryDependencies ++= Seq("com.gravity" %% "goose" % "2.1.25-SNAPSHOT") + +resolvers += Resolver.sonatypeRepo("public") +``` + +##Intro + +Goose was originally an article extractor written in Java that has most recently (aug2011) been converted to a Scala project.
Its mission is to take any news article or article type web page and not only extract what is the main body of the article but also all meta data and most probable image candidate. The extraction goal is to try and get the purest extraction from the beginning of the article for servicing flipboard/pulse type applications that need to show the first snippet of a web article along with an image. @@ -35,15 +48,52 @@ If you find Goose useful or have issues please drop me a line, I'd love to hear Goose is licensed by Gravity.com under the Apache 2.0 license, see the LICENSE file for more details +##Environment Prerequisites + +By default, Goose uses Java's image processing capabilities. + +### ImageMagick + +You will need to have ImageMagick installed for Goose to work correctly. + +On osx, you can install with brew: + $ brew install imagemagick + +Update Configuration.scala with the location of identify and convert (eg /usr/local/bin) + ##Take it for a spin + +### SBT To use goose from the command line: + cd into the goose directory + sbt "run-main com.gravity.goose.TalkToMeGoose http://techcrunch.com/2011/05/13/native-apps-or-web-apps-particle-code-wants-you-to-do-both/" + +### MVN cd into the goose directory mvn compile MAVEN_OPTS="-Xms256m -Xmx2000m"; mvn exec:java -Dexec.mainClass=com.gravity.goose.TalkToMeGoose -Dexec.args="http://techcrunch.com/2011/05/13/native-apps-or-web-apps-particle-code-wants-you-to-do-both/" -e -q > ~/Desktop/gooseresult.txt -##Regarding the port from JAVA to Scala +##Testing +To run the junit tests, kick off the sbt test target: + + sbt test + +Note that there are currently problems in the tests.
(8 failures in 41 tests on 2014-07-10 - raisercostin) + +##Usage as a maven dependency + +Last version (goose_2.10-2.2.0.jar) is hosted at http://raisercostin.googlecode.com/svn/maven2/com/gravity/goose/ +Goose is hosted on Sonatype's OSS repository, https://oss.sonatype.org/content/repositories/releases/com/gravity/goose/ + + + com.gravity + goose + 2.1.22 + + +##Regarding the port from Java to Scala Here are some of the reasons for the port to Scala: @@ -54,5 +104,47 @@ Here are some of the reasons for the port to Scala: ##Issues + It was a pretty fast Java to Scala port so lots of the nicities of the Scala language aren't in the codebase yet, but those will come over the coming months as we re-write alot of the internal methods to be more Scalesque. -We made sure it was still nice and operable from Java as well so if you're using goose from java you still should be able to use it with a few changes to the method signatures. \ No newline at end of file +We made sure it was still nice and operable from Java as well so if you're using goose from java you still should be able to use it with a few changes to the method signatures. + + +##Goose is now language aware + +The stopword lists introduced in the [Python-Goose project](https://github.com/grangier/python-goose) have been incorporated +into Goose. 
+ + + +## Deploy libraries to bintray + + +- configure your ~/.m2/settings.xml as + ``` + + + raisercostin-releases + svn-user + svn-pass + + + ``` + +- deploy for scala 2.11 + ```mvn -f pom_scala211.xml deploy -DskipTests -Prelease``` + +- deploy for scala 2.10 + ```mvn -f pom_scala210.xml deploy -DskipTests -Prelease``` diff --git a/autorelease b/autorelease new file mode 100755 index 000000000..d0c46e00f --- /dev/null +++ b/autorelease @@ -0,0 +1,74 @@ +#!/bin/bash + +name="Capfile" +deploy_dir="deploy" +group=ad3 +tmp_file="$deploy_dir/$name.tmp" +conf_file="$deploy_dir/$name.conf" + +function createCap(){ + details=`awk -F ":" -v g=$group '$1==g {print $2}' $conf_file` + echo $details + IFS=, + ary=($details) + #count=${#ary[@]} + user=${ary[0]} + adminuser=${ary[1]} + port=${ary[2]} + hosts=${ary[3]} + ahost=${ary[@]:3:100} + IFS= + hosts=`echo $ahost|sed 's/ /,/g'` + + + echo "use group : "$group + echo "user: "$group + echo "admin_runner : "$adminuser + echo "port : "$port + echo "hosts : "$hosts + cp $tmp_file $group_file + sed -i "s/##USER##/$user/g" $group_file + sed -i "s/##ADMIN_RUNNER##/$adminuser/g" $group_file + sed -i "s/##PORT##/$port/g" $group_file + sed -i "s/##HOSTS##/$hosts/g" $group_file +} + +function createRelease(){ + + cd release + git pull origin master + + git rm ./*.jar + git rm config/* + git rm scripts/* + + cd .. + sbt clean update package-dist + cd release + + git status + git add . + git commit -m "make release" + git push origin master + + cd .. +} + + +if [ $# = 0 ] ; then + echo "Usage : ./deploy.sh group1 group2.." 
+else + for i in "$@" + do + if [ "$i" == "tmp" ] || [ "$i" == "conf" ]; then + echo "Group name can not be tmp or conf" + else + group=$i + #group_file="$deploy_dir/$name.$group" + group_file="$name" + createCap $i + #createRelease $i + cap -f $group_file deploy + fi + done +fi diff --git a/build.sbt b/build.sbt new file mode 100644 index 000000000..9f61dc60e --- /dev/null +++ b/build.sbt @@ -0,0 +1,154 @@ +import sbt._ +import Keys._ +import com.typesafe.sbteclipse.plugin.EclipsePlugin.EclipseKeys +//assembly: import com.twitter.sbt._ +//assembly: import AssemblyKeys._ + +//assembly: seq(StandardProject.newSettings: _*) + +//organization := "GravityLabs" +organization := "com.gravity" + +name := "goose" + +version := "2.2.2-SNAPSHOT" + +organizationHomepage := Some(url("http://gravity.com/")) + +homepage := Some(url("https://github.com/raisercostin/goose")) + +description := "Extracts text, metadata, and key image from web articles." + +licenses += "Apache2" -> url("http://www.apache.org/licenses/") + +scalaVersion := "2.11.2" +//scalaVersion := "2.11.1" +//scalaVersion := "2.10.2" + +crossScalaVersions := Seq("2.11.2")//, "2.11.0", "2.10.4") + +testFrameworks += TestFrameworks.ScalaTest + +testOptions in Test += Tests.Argument("-oF") + +//assembly: seq(assemblySettings: _*) +resolvers ++= Seq( + //"sonatype snapshots" at "https://oss.sonatype.org/content/repositories/snapshots", + "central mvn repo" at "http://repo1.maven.org/", + "Oracle Maven 2 Repository" at "http://download.oracle.com/maven", + "JBoss Maven 2 Repository" at "http://repository.jboss.com/maven2", + "JLangDetect Maven repository" at "https://jlangdetect.googlecode.com/svn/repo", + "raisercostin repository" at "http://raisercostin.googlecode.com/svn/maven2" +) + +credentials += Credentials(Path.userHome / ".ivy2" / ".credentials") + +libraryDependencies ++= { + Seq( + "org.slf4j" % "slf4j-api" % "1.7.7" + //,"org.slf4j" % "slf4j-simple" % "1.7.7" + ,"org.slf4j" % "slf4j-log4j12" % "1.7.7" % Test 
+ ,"log4j" % "log4j" % "1.2.14" + ,"commons-io" % "commons-io" % "2.4" + ,"commons-lang" % "commons-lang" % "2.6" + ,"org.apache.httpcomponents" % "httpclient" % "4.3.3" + ,"com.ibm.icu" % "icu4j" % "53.1" + ,"me.champeau.jlangdetect" % "jlangdetect-extra" % "0.4" + ,"org.jsoup" % "jsoup" % "1.7.3" + ,"net.liftweb" %% "lift-json" % "2.6-RC1" + ,"com.github.nscala-time" %% "nscala-time" % "1.4.0" + ,"com.typesafe" % "config" % "1.0.2" + ,"com.jsuereth" %% "scala-arm" % "1.4" + ,"org.specs2" %% "specs2" % "2.3.11" + ,"org.jsoup" % "jsoup" % "1.7.3" + ,"com.chenlb.mmseg4j" % "mmseg4j-core" % "1.9.1" + ,"com.googlecode.juniversalchardet" % "juniversalchardet" % "1.0.3" + //gae + ,"com.google.appengine" % "appengine-api-labs" % "1.7.1" + ,"com.google.appengine" % "appengine-api-stubs" % "1.7.1" + ,"com.google.appengine" % "appengine-testing" % "1.7.1" + ,"com.google.appengine" % "appengine-api-1.0-sdk" % "1.7.1" + //add json service + ,"com.fasterxml.jackson.core" % "jackson-databind" % "2.4.3" + ,"com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.4.3" + ,"org.simpleframework" % "simple" % "4.1.21" + //add cassandra + //,"com.netflix.astyanax" % "astyanax-core" % "1.56.43" + //,"com.netflix.astyanax" % "astyanax-thrift" % "1.56.43" + //,"com.netflix.astyanax" % "astyanax-cassandra" % "1.56.43" + //tests + ,"junit" % "junit" % "4.11" % Test + ,"org.scalatest" %% "scalatest" % "2.2.1" % Test + // Testing dependencies + ,"com.novocode" % "junit-interface" % "0.10" % Test + //"org.scala-lang" % "scala-compiler" % "2.9.0-1", + //"org.scala-lang" % "scala-library" % "2.9.0-1", + //"org.scala-lang" % "scala-reflect" % "2.10.0", + ) +} + +publishMavenStyle := true + +pomIncludeRepository := { _ => true} + +//publishTo := Some(Resolver.file("Github Pages", Path.userHome /"repo" / "maven" asFile)(Patterns(true, Resolver.mavenStyleBasePattern))) +//publishTo := Some(Resolver.file("goose", new File("d:/Dropbox/public/libs"))(Patterns(true, 
Resolver.mavenStyleBasePattern)) ) +publishTo := Some(Resolver.file("goose", new File("./target/publish"))(Patterns(true, Resolver.mavenStyleBasePattern)) ) +//publishTo := { +// val nexus = "https://oss.sonatype.org/" +// if (isSnapshot.value) +// Some("snapshots" at nexus + "content/repositories/snapshots") +// else +// Some("releases" at nexus + "service/local/staging/deploy/maven2") +//} + +pomExtra := ( + + git@github.com:raisercostin/goose.git + scm:git:git@github.com:raisercostin/goose.git + +) + +EclipseKeys.createSrc := EclipseCreateSrc.Default + EclipseCreateSrc.Resource + +// Get rid of java source directories in compile +unmanagedSourceDirectories in Compile <<= (scalaSource in Compile)(Seq(_)) + +// Get rid of java source directories in test +unmanagedSourceDirectories in Test <<= (scalaSource in Test)(Seq(_)) + +//assembly: packageDistDir <<= (baseDirectory, packageDistName) { (b, n) => b / "release" } + +parallelExecution in Test := false + +net.virtualvoid.sbt.graph.Plugin.graphSettings + + +scalacOptions ++= Seq("-unchecked", "-deprecation") + +//to see https://bitbucket.org/diversit/webdav4sbt +def svnPublish = Command.args("svnPublish", "") { (state, args) => + val ver = "2.2.2-SNAPSHOT" + val svnUrl = """https://raisercostin.googlecode.com/svn/maven2""" + val command = s"""svn import -m "binary release" target\\publish\\com\\gravity\\goose\\goose_2.11\\$ver $svnUrl/com/gravity/goose/goose_2.11/$ver """ + println(s"\nexecute $command") + command.! 
+ state +} + +commands ++= Seq(svnPublish) + +version <<= version { v => //only release *if* -Drelease=true is passed to JVM + val release = Option(System.getProperty("release")) == Some("true") + if (release) { + v + } else { + val suffix = Option(System.getProperty("suffix")) + val i = (v.indexOf('-'), v.length) match { + case (x, l) if x < 0 => l + case (x, l) if v substring (x + 1) matches """\d+""" => l //patch level, not RCx + case (x, _) => x + } + v.substring(0, i) + "-" + (suffix getOrElse "SNAPSHOT") + } +} diff --git a/gen-idea b/gen-idea new file mode 100755 index 000000000..cfc279e7b --- /dev/null +++ b/gen-idea @@ -0,0 +1,3 @@ +#!/bin/bash + +sbt 'gen-idea no-classifiers' diff --git a/history.txt b/history.txt new file mode 100644 index 000000000..1b7e997c6 --- /dev/null +++ b/history.txt @@ -0,0 +1,2 @@ +cat *de.txt|sort|uniq >stopwords-de2.txt +cat *.txt|sort|uniq >stopwords-all.txt2 \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 000000000..0d0182726 --- /dev/null +++ b/index.html @@ -0,0 +1,77 @@ + + + + + + geek4377/goose @ GitHub + + + + + + Fork me on GitHub + +
+ +
+ + + + +
+ +

goose + by geek4377

+ +
+ Html Content / Article Extractor in Java open sourced from Gravity Labs - http://gravity.com +
+ +

Authors

+

Umar Shah (geek4377@gmail.com)
Jim Plush (jiminoc@gmail.com)

+

Contact

+

(geek4377@gmail.com)

+ + +

Download

+

+ You can download this project in either + zip or + tar formats. +

+

You can also clone the project with Git + by running: +

$ git clone git://github.com/geek4377/goose
+

+ + + +
+ + + + diff --git a/misc/PSD/graphics.psd b/misc/PSD/graphics.psd deleted file mode 100644 index fc061d893..000000000 Binary files a/misc/PSD/graphics.psd and /dev/null differ diff --git a/mkrelease b/mkrelease new file mode 100755 index 000000000..db0691fdc --- /dev/null +++ b/mkrelease @@ -0,0 +1,3 @@ +#!/bin/bash +rm release/*.jar +sbt clean update package-dist diff --git a/pom.xml b/pom.xml deleted file mode 100644 index 9fef84cba..000000000 --- a/pom.xml +++ /dev/null @@ -1,154 +0,0 @@ - - 4.0.0 - - com.gravity - goose - 2.1.22 - jar - goose - http://maven.apache.org - - UTF-8 - - - - - junit - junit - 4.8.1 - test - - - org.slf4j - slf4j-api - 1.6.1 - jar - compile - - - org.slf4j - slf4j-log4j12 - 1.6.1 - test - - - org.jsoup - jsoup - 1.5.2 - - - commons-io - commons-io - 2.0.1 - - - - org.scala-lang - scala-compiler - 2.9.0-1 - compile - - - - org.scala-lang - scala-library - 2.9.0-1 - - - - org.apache.httpcomponents - httpclient - 4.1.2 - - - commons-lang - commons-lang - 2.6 - - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 2.6 - - false - true - - **/*Test.* - - - **/*IT.* - - - - - org.scala-tools - maven-scala-plugin - 2.14.3 - - UTF-8 - - -Xmx1024m - - - - - compile - - compile - - compile - - - test-compile - - testCompile - - test-compile - - - process-resources - - compile - - - - - - - org.apache.maven.plugins - maven-source-plugin - 2.1.2 - - - attach-sources - - jar - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.8 - - - attach-javadocs - - jar - - - - - - - - \ No newline at end of file diff --git a/pom_scala210.xml b/pom_scala210.xml new file mode 100644 index 000000000..326d2b7ae --- /dev/null +++ b/pom_scala210.xml @@ -0,0 +1,524 @@ + + + + + 3.0.4 + + 4.0.0 + com.gravity.goose + goose_2.10 + + 2.2.8 + jar + goose + + UTF-8 + 1.5 + 1.5 + 1.7 + 2.10 + 2.10.6 + + 3.1.2.RELEASE + 1.7.1 + ${appengine.sdk.root} + 1.7.7 + + raisercostin + goose + https://github.com/${github.user}/${github.repo} + ${github.user} + 
maven + ${github.repo} + + ${github.url} + Html Content / Article Extractor in Scala - open sourced from Gravity Labs - http://gravity.com +Goose was originally an article extractor written in Java that has most recently (aug2011) converted to a scala project. It's mission is to take any news article or article type web page and not only extract what is the main body of the article but also all meta data and most probable image candidate. + + + The Apache Software License, Version 2.0 + repo + http://www.apache.org/licenses/LICENSE-2.0.txt + + + + scm:git:${github.url}.git + scm:git:${github.url}.git + ${github.url} + ${project.artifactId}-${project.version} + + + + + bintray + https://api.bintray.com/maven/${bintray.user}/${bintray.repo}/${bintray.package}/;publish=1 + + + + Github + https://github.com/raisercostin/${project.artifactId}/issues + + + + jiminoc + Jim Plush + Gravity Labs + http://www.gravity.com/GravityLabs + http://jimplush.com + + + erraggy + Robbie Coleman + Gravity Labs + http://www.gravity.com/GravityLabs + http://robbie.robnrob.com/ + + + raisercostin + Costin Grigore + http://raisercostin.org/ + + + + + org.slf4j + slf4j-api + ${slf4j.version} + + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + test + + + log4j + log4j + 1.2.14 + + + + commons-io + commons-io + 2.4 + + + commons-lang + commons-lang + 2.6 + + + org.apache.httpcomponents + httpclient + 4.3.3 + + + org.jsoup + jsoup + 1.7.3 + + + com.chenlb.mmseg4j + mmseg4j-core + 1.9.1 + + + com.github.nscala-time + nscala-time_${scala.prefix} + 1.4.0 + + + + org.scala-lang + scala-compiler + ${scala.version} + + + org.scala-lang + scala-library + ${scala.version} + + + + com.googlecode.juniversalchardet + juniversalchardet + 1.0.3 + + + com.ibm.icu + icu4j + 53.1 + + + junit + junit + 4.11 + test + + + + com.google.appengine + appengine-api-labs + ${gae.version} + + + com.google.appengine + appengine-api-stubs + ${gae.version} + + + com.google.appengine + appengine-testing + ${gae.version} 
+ + + com.google.appengine + appengine-api-1.0-sdk + ${gae.version} + compile + + + + com.fasterxml.jackson.core + jackson-databind + 2.4.3 + + + com.fasterxml.jackson.module + jackson-module-scala_${scala.prefix} + 2.4.3 + + + org.simpleframework + simple + 4.1.21 + + + + + + package + ${project.basedir}/target + ${project.artifactId}-${project.version} + ${basedir}/src/main/scala + ${basedir}/src/test/scala + ${project.basedir}/target/scala-out/classes + ${project.basedir}/target/scala-out/test-classes + + + org.apache.maven.plugins + maven-surefire-plugin + 2.17 + + false + true + + **/*Test.* + + + **/*IT.* + + + + + + net.alchim31.maven + scala-maven-plugin + 3.1.6 + + + -Xmx1024m + + ${project.build.sourceEncoding} + ${scala.prefix} + ${scala.version} + + -deprecation + -feature + + + + + attach-javadocs + + doc-jar + + + + compile + + compile + + compile + + + test-compile + + testCompile + + test-compile + + + process-resources + + compile + + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-source + generate-sources + + add-source + + + + src/main/scala + + + + + + add-test-source + generate-sources + + add-test-source + + + + src/test/scala + + + + + + + org.apache.maven.plugins + maven-eclipse-plugin + 2.8 + + false + false + + org.scala-ide.sdt.core.scalanature + org.eclipse.jdt.core.javanature + + + org.scala-ide.sdt.core.scalabuilder + + + org.scala-ide.sdt.launching.SCALA_CONTAINER + org.eclipse.jdt.launching.JRE_CONTAINER + + + + org.scala-lang:scala-library + org.scala-lang:scala-compiler + + + **/*.scala + **/*.java + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + true + true + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + ${project.build.sourceEncoding} + ${source.version} + ${source.version} + + + + org.apache.maven.plugins + maven-resources-plugin + 2.6 + + ${project.build.sourceEncoding} + + + + org.apache.maven.plugins + maven-release-plugin + 2.5 + + + org.apache.maven.plugins + 
maven-deploy-plugin + 2.8.1 + + + + + + + com.google.code.maven-svn-wagon + maven-svn-wagon + 1.4 + + + + + + release + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-source-plugin + 2.3 + + + attach-sources + + jar + + + + + + + + + diff --git a/pom_scala211.xml b/pom_scala211.xml new file mode 100644 index 000000000..b94a4dcc5 --- /dev/null +++ b/pom_scala211.xml @@ -0,0 +1,524 @@ + + + + + 3.0.4 + + 4.0.0 + com.gravity.goose + goose_2.11 + + 2.2.8 + jar + goose + + UTF-8 + 1.5 + 1.5 + 1.7 + 2.11 + 2.11.8 + + 3.1.2.RELEASE + 1.7.1 + ${appengine.sdk.root} + 1.7.7 + + raisercostin + goose + https://github.com/${github.user}/${github.repo} + ${github.user} + maven + ${github.repo} + + ${github.url} + Html Content / Article Extractor in Scala - open sourced from Gravity Labs - http://gravity.com +Goose was originally an article extractor written in Java that has most recently (aug2011) converted to a scala project. It's mission is to take any news article or article type web page and not only extract what is the main body of the article but also all meta data and most probable image candidate. 
+ + + The Apache Software License, Version 2.0 + repo + http://www.apache.org/licenses/LICENSE-2.0.txt + + + + scm:git:${github.url}.git + scm:git:${github.url}.git + ${github.url} + ${project.artifactId}-${project.version} + + + + + bintray + https://api.bintray.com/maven/${bintray.user}/${bintray.repo}/${bintray.package}/;publish=1 + + + + Github + https://github.com/raisercostin/${project.artifactId}/issues + + + + jiminoc + Jim Plush + Gravity Labs + http://www.gravity.com/GravityLabs + http://jimplush.com + + + erraggy + Robbie Coleman + Gravity Labs + http://www.gravity.com/GravityLabs + http://robbie.robnrob.com/ + + + raisercostin + Costin Grigore + http://raisercostin.org/ + + + + + org.slf4j + slf4j-api + ${slf4j.version} + + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + test + + + log4j + log4j + 1.2.14 + + + + commons-io + commons-io + 2.4 + + + commons-lang + commons-lang + 2.6 + + + org.apache.httpcomponents + httpclient + 4.3.3 + + + org.jsoup + jsoup + 1.7.3 + + + com.chenlb.mmseg4j + mmseg4j-core + 1.9.1 + + + com.github.nscala-time + nscala-time_${scala.prefix} + 1.4.0 + + + + org.scala-lang + scala-compiler + ${scala.version} + + + org.scala-lang + scala-library + ${scala.version} + + + + com.googlecode.juniversalchardet + juniversalchardet + 1.0.3 + + + com.ibm.icu + icu4j + 53.1 + + + junit + junit + 4.11 + test + + + + com.google.appengine + appengine-api-labs + ${gae.version} + + + com.google.appengine + appengine-api-stubs + ${gae.version} + + + com.google.appengine + appengine-testing + ${gae.version} + + + com.google.appengine + appengine-api-1.0-sdk + ${gae.version} + compile + + + + com.fasterxml.jackson.core + jackson-databind + 2.4.3 + + + com.fasterxml.jackson.module + jackson-module-scala_${scala.prefix} + 2.4.3 + + + org.simpleframework + simple + 4.1.21 + + + + + + package + ${project.basedir}/target + ${project.artifactId}-${project.version} + ${basedir}/src/main/scala + ${basedir}/src/test/scala + 
${project.basedir}/target/scala-out/classes + ${project.basedir}/target/scala-out/test-classes + + + org.apache.maven.plugins + maven-surefire-plugin + 2.17 + + false + true + + **/*Test.* + + + **/*IT.* + + + + + + net.alchim31.maven + scala-maven-plugin + 3.1.6 + + + -Xmx1024m + + ${project.build.sourceEncoding} + ${scala.prefix} + ${scala.version} + + -deprecation + -feature + + + + + attach-javadocs + + doc-jar + + + + compile + + compile + + compile + + + test-compile + + testCompile + + test-compile + + + process-resources + + compile + + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-source + generate-sources + + add-source + + + + src/main/scala + + + + + + add-test-source + generate-sources + + add-test-source + + + + src/test/scala + + + + + + + org.apache.maven.plugins + maven-eclipse-plugin + 2.8 + + false + false + + org.scala-ide.sdt.core.scalanature + org.eclipse.jdt.core.javanature + + + org.scala-ide.sdt.core.scalabuilder + + + org.scala-ide.sdt.launching.SCALA_CONTAINER + org.eclipse.jdt.launching.JRE_CONTAINER + + + + org.scala-lang:scala-library + org.scala-lang:scala-compiler + + + **/*.scala + **/*.java + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + true + true + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + ${project.build.sourceEncoding} + ${source.version} + ${source.version} + + + + org.apache.maven.plugins + maven-resources-plugin + 2.6 + + ${project.build.sourceEncoding} + + + + org.apache.maven.plugins + maven-release-plugin + 2.5 + + + org.apache.maven.plugins + maven-deploy-plugin + 2.8.1 + + + + + + + com.google.code.maven-svn-wagon + maven-svn-wagon + 1.4 + + + + + + release + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-source-plugin + 2.3 + + + attach-sources + + jar + + + + + + + + + diff --git a/project/Build.scala-new b/project/Build.scala-new new file mode 100644 index 
000000000..032697058 --- /dev/null +++ b/project/Build.scala-new @@ -0,0 +1,23 @@ +import sbt._ +import Keys._ + +object Dependencies { + val Slf4jApi = "org.slf4j" % "slf4j-api" % "1.6.6" + val Slf4jLog4j12 = "org.slf4j" % "slf4j-log4j12" % "1.6.6" + val Jsoup = "org.jsoup" % "jsoup" % "1.7.2" + val CommonsIo = "commons-io" % "commons-io" % "2.0.1" + val ScalaCompiler = "org.scala-lang" % "scala-compiler" % "2.9.2" + val ScalaLibrary = "org.scala-lang" % "scala-library" % "2.9.2" + val Httpclient = "org.apache.httpcomponents" % "httpclient" % "4.2.4" + val CommonsLang = "commons-lang" % "commons-lang" % "2.6" + val Juniversalchardet = "com.googlecode.juniversalchardet" % "juniversalchardet" % "1.0.3" + val LangidJava = "com.carrotsearch" % "langid-java" % "1.0.0" +} + +object GooseBuild extends Build { + import Dependencies._ + + lazy val root = Project("goose", file("."), + settings = Defaults.defaultSettings ++ + Seq(libraryDependencies ++= Seq(Slf4jApi, Slf4jLog4j12, Jsoup, CommonsIo, ScalaCompiler, ScalaLibrary, Httpclient, CommonsLang, Juniversalchardet, LangidJava))) +} \ No newline at end of file diff --git a/project/GooseBuild.scala-new b/project/GooseBuild.scala-new new file mode 100644 index 000000000..8535483fd --- /dev/null +++ b/project/GooseBuild.scala-new @@ -0,0 +1,91 @@ +import sbt._ +import Keys._ + + +object GooseBuild extends Build { + + lazy val goose = Project( + id = "goose", + base = file("."), + settings = Project.defaultSettings ++ Seq( + description := "Html Content / Article Extractor in Scala", + organization := "com.gravity", + version := "2.1.22-SNAPSHOT", + version <<= version { v => //only release *if* -Drelease=true is passed to JVM + val release = Option(System.getProperty("release")) == Some("true") + if (release) { + v + } else { + val suffix = Option(System.getProperty("suffix")) + val i = (v.indexOf('-'), v.length) match { + case (x, l) if x < 0 => l + case (x, l) if v substring (x + 1) matches """\d+""" => l //patch level, 
not RCx + case (x, _) => x + } + v.substring(0, i) + "-" + (suffix getOrElse "SNAPSHOT") + } + }, + parallelExecution := false, + publishMavenStyle := true, + scalaVersion := "2.9.2", + crossScalaVersions := Seq("2.9.2", "2.9.1", "2.9.0"), + licenses := Seq("Apache 2" -> url("http://www.apache.org/licenses/LICENSE-2.0.txt")), + homepage := Some(url("http://github.com/jaytaylor/goose")), + pomExtra := ( + + git@github.com:jaytaylor/goose.git + scm:git:git@github.com:jaytaylor/goose.git + + + + jaytaylor + Jay Taylor + https://github.com/jaytaylor + + + ), + publishTo <<= version { v => + Some(Resolver.sftp( + "Scala.sh Repository", + "scala.sh", + "/var/www/scala.sh/public_html/repositories/" + ( + if (v.trim.endsWith("SNAPSHOT")) { "snapshots" } else { "releases" } + ) + )) + }, + publishArtifact in Test := false, + pomIncludeRepository := { _ => false }, + resolvers ++= Seq( + "Sonatype Releases" at "http://oss.sonatype.org/content/repositories/releases", + "JBoss Repository" at "http://repository.jboss.org/nexus/content/groups/public", + "CodaHale Repository" at "http://repo.codahale.com", + "Scala.sh Releases" at "http://scala.sh/repositories/releases", + "Scala.sh Snapshots" at "http://scala.sh/repositories/snapshots", + "Maven1" at "http://repo1.maven.org/maven2", + "Typesafe Artifactory" at "http://typesafe.artifactoryonline.com/typesafe/repo", + "Typesafe Releases" at "http://repo.typesafe.com/typesafe/releases", + "iBiblio Maven2" at "http://mirrors.ibiblio.org/maven2" + ), + libraryDependencies ++= Seq( + "junit" % "junit" % "4.8.1", + "org.slf4j" % "slf4j-api" % "1.6.1", + "org.slf4j" % "slf4j-log4j12" % "1.6.1", + "org.jsoup" % "jsoup" % "1.5.2", + "commons-io" % "commons-io" % "2.0.1", + "org.apache.httpcomponents" % "httpclient" % "4.1.2", + "commons-lang" % "commons-lang" % "2.6" + ), + libraryDependencies <++= scalaVersion { sv => + Seq( + "org.scala-lang" % "scalap" % sv, + if (sv startsWith "2.9") { + "org.scalatest" % "scalatest_2.9.1" % "1.6.1" % 
"test" + } else { + "org.scalatest" % "scalatest_2.8.2" % "1.5.1" % "test" + } + ) + } + ) + ) +} + diff --git a/project/build.properties b/project/build.properties new file mode 100644 index 000000000..8cbb5226c --- /dev/null +++ b/project/build.properties @@ -0,0 +1 @@ +sbt.version=0.13.0 \ No newline at end of file diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 000000000..7a3c53b85 --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,23 @@ +resolvers ++= Seq( + //"releases" at "http://scala-tools.org/repo-releases", + //"umeng.com releases" at "http://122.11.52.227:8088/nexus/content/repositories/releases", + "Oracle Maven 2 Repository" at "http://download.oracle.com/maven", + "JBoss Maven 2 Repository" at "http://repository.jboss.com/maven2", + "sbt-plugin-releases" at "http://repo.scala-sbt.org/scalasbt/sbt-plugin-releases", + "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/" +) + +//addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") + +//addSbtPlugin("com.twitter" % "sbt-package-dist" % "1.0.6") + +//addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.1") +addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.3.0") + +addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") + +// Comment to get more information during initialization +logLevel := Level.Warn + +// Use the Play sbt plugin for Play projects +addSbtPlugin("com.typesafe.play" % "sbt-plugin" % "2.2.1") \ No newline at end of file diff --git a/src/main/resources/com/gravity/goose/images/known-image-css.txt b/src/main/resources/com/gravity/goose/images/known-image-css.txt index d9ffd3526..e8e444ca9 100644 --- a/src/main/resources/com/gravity/goose/images/known-image-css.txt +++ b/src/main/resources/com/gravity/goose/images/known-image-css.txt @@ -1,5 +1,5 @@ latimes.com^thumbnail -cnn.com^storytext|cnn_strycntntlft +cnn.com^storytext foxnews.com^entry-content msn.com^articleText go.com^mediaimage diff 
--git a/src/main/resources/com/gravity/goose/statichtml/allImages.txt b/src/main/resources/com/gravity/goose/statichtml/allImages.txt new file mode 100644 index 000000000..a5e2d8cc7 --- /dev/null +++ b/src/main/resources/com/gravity/goose/statichtml/allImages.txt @@ -0,0 +1,986 @@ + + + Understanding StatsD and Graphite - pkhamre.blog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

pkhamre.blog

+ +

thoughts, devops, tools and stuff.

+ +
+
+ +
+ +
+
+
+
+ +
+ + +

Understanding StatsD and Graphite

+ + + +

+ + + + + + + + + + + + + + +

+ +
+ + +

+ +

After a short conversation with BryanWB_ on the #logstash channel at Freenode, +I realized that I did not know how my data was sent and how it was stored in Graphite. +I knew that StatsD collects and aggregates my metrics. And I knew that StatsD +ships them off to Graphite. Which I knew stores the time-series data and +enables us to render graphs based on these data.

+ +

What I did not know was if my http-access graphs displayed requests per second, +average requests per retention or anything else.

+ +

It was time to research how these things worked in order to get a complete +understanding.

+ +

StatsD

+ +

To get a full understanding of how StatsD works, I started to read the source +code. I knew StatsD was a simple application, but I did not knew it was this +simple. Just over 300 lines of code in the main script and around 150 +lines in the graphite backend code.

+ +

Concepts in StatsD

+ +

StatsD has a few concepts listed in the documentation that should be +understood.

+ +

Buckets

+ +

Each stat is in its own “bucket”. They are not predefined anywhere. Buckets can be +named anything that will translate to Graphite (periods make folders, etc)

+ +

Values

+ +

Each stat will have a value. How it is interpreted depends on modifiers. In general +values should be integer.

+ +

Flush interval

+ +

After the flush interval timeout (default 10 seconds), stats are aggregated and +sent to an upstream backend service.

+ +

Metric types

+ +

Counters

+ +

Counters are simple. It adds a value to a bucket and stays in memory until the +flush interval.

+ +

Let's take a look at the source code that generates the counter stats that get +flushed to the backend.

+ +
1
+2
+3
+4
+5
+6
+7
+8
+9
+
for (key in counters) {
+  var value = counters[key];
+  var valuePerSecond = value / (flushInterval / 1000); // calculate "per second" rate
+
+  statString += 'stats.'        + key + ' ' + valuePerSecond + ' ' + ts + "\n";
+  statString += 'stats_counts.' + key + ' ' + value          + ' ' + ts + "\n";
+
+  numStats += 1;
+}
+
+ + +

First, StatsD iterates over any counters received, where it starts by assigning +two variables. One variable holds the counter value, and one variable +holds the per-second value. It then adds the values to the statString and +increases the numStats variable.

+ +

If you have the default flush interval, 10 seconds, and send StatsD +7 increments on a counter within the flush interval, the counter would be 7 and +the per-second value would be 0.7. No magic.

+ +

Timers

+ +

Timers collect numbers. They do not necessarily need to contain a value of +time. You can collect bytes read, number of objects in some storage, or +anything that is a number. A good thing about timers is that you get the mean, +the sum, the count, the upper and the lower values for free. Feed StatsD +a timer and these get automatically calculated for you before it is flushed to +Graphite. Oh, I almost forgot to mention that you also get the 90th percentile +calculated for the mean, sum and upper values as well. You can also configure +StatsD to use an array of numbers as percentiles, which means you can get 50th +percentile, 90th percentile and 95th percentile calculated for you if you want.

+ +

The source code for timer stats is a bit more advanced than the code for the +counters.

+ +
1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+
for (key in timers) {
+  if (timers[key].length > 0) {
+    var values = timers[key].sort(function (a,b) { return a-b; });
+    var count = values.length;
+    var min = values[0];
+    var max = values[count - 1];
+
+    var cumulativeValues = [min];
+    for (var i = 1; i < count; i++) {
+        cumulativeValues.push(values[i] + cumulativeValues[i-1]);
+    }
+
+    var sum = min;
+    var mean = min;
+    var maxAtThreshold = max;
+
+    var message = "";
+
+    var key2;
+
+    for (key2 in pctThreshold) {
+      var pct = pctThreshold[key2];
+      if (count > 1) {
+        var thresholdIndex = Math.round(((100 - pct) / 100) * count);
+        var numInThreshold = count - thresholdIndex;
+
+        maxAtThreshold = values[numInThreshold - 1];
+        sum = cumulativeValues[numInThreshold - 1];
+        mean = sum / numInThreshold;
+      }
+
+      var clean_pct = '' + pct;
+      clean_pct.replace('.', '_');
+      message += 'stats.timers.' + key + '.mean_'  + clean_pct + ' ' + mean           + ' ' + ts + "\n";
+      message += 'stats.timers.' + key + '.upper_' + clean_pct + ' ' + maxAtThreshold + ' ' + ts + "\n";
+      message += 'stats.timers.' + key + '.sum_' + clean_pct + ' ' + sum + ' ' + ts + "\n";
+    }
+
+    sum = cumulativeValues[count-1];
+    mean = sum / count;
+
+    message += 'stats.timers.' + key + '.upper ' + max   + ' ' + ts + "\n";
+    message += 'stats.timers.' + key + '.lower ' + min   + ' ' + ts + "\n";
+    message += 'stats.timers.' + key + '.count ' + count + ' ' + ts + "\n";
+    message += 'stats.timers.' + key + '.sum ' + sum  + ' ' + ts + "\n";
+    message += 'stats.timers.' + key + '.mean ' + mean + ' ' + ts + "\n";
+    statString += message;
+
+    numStats += 1;
+  }
+}
+
+ + +

StatsD iterates over each timer and processes the timer if it holds at least +one value. It then sorts the array of values and simply counts it and locates the +minimum and maximum values. An array of the cumulative values is created and +a few variables are assigned before it starts to iterate over the percentile +thresholds array to calculate percentiles and create the messages to assign to +the statString variable. When percentile calculation is done, the final sum +gets assigned and the final statString is created.

+ +

If you send the following timer values to StatsD during the default flush interval

+ +
    +
  • 450
  • +
  • 120
  • +
  • 553
  • +
  • 994
  • +
  • 334
  • +
  • 844
  • +
  • 675
  • +
  • 496
  • +
+ + +

StatsD will calculate the following values

+ +
    +
  • mean_90 496
  • +
  • upper_90 844
  • +
  • sum_90 3472
  • +
  • upper 994
  • +
  • lower 120
  • +
  • count 8
  • +
  • sum 4466
  • +
  • mean 558.25
  • +
+ + +

Gauges

+ +

A gauge simply indicates an arbitrary value at a point in time and is the most +simple type in StatsD. It just takes any number and ships it to the backend.

+ +

The source code for gauge stats is just four lines.

+ +
1
+2
+3
+4
+
for (key in gauges) {
+  statString += 'stats.gauges.' + key + ' ' + gauges[key] + ' ' + ts + "\n";
+  numStats += 1;
+}
+
+ + +

Feed StatsD a number and it sends it unprocessed to the backend. A thing to +note is that only the last value of a gauge during a flush interval is flushed +to the backend. That means that if you send the following gauge values to +StatsD during a flush interval

+ +
    +
  • 643
  • +
  • 754
  • +
  • 583
  • +
+ + +

The only value that gets flushed to the backend is 583. The value of this +gauge will be kept in memory in StatsD and be sent to the backend at the end of +every flush interval.

+ +

Graphite

+ +

Now that we know how our data is sent from StatsD, let's take a look at how it +is stored and processed in Graphite.

+ +

Overview

+ +

In the Graphite documentation we can find the Graphite overview. It sums +up Graphite with these two simple points.

+ +
    +
  • Graphite stores numeric time-series data.
  • +
  • Graphite renders graphs of this data on demand.
  • +
+ + +

Graphite consists of three parts.

+ +
    +
  • carbon - a daemon that listens for time-series data.
  • +
  • whisper - a simple database library for storing time-series data.
  • +
  • webapp - a (Django) webapp that renders graphs on demand.
  • +
+ + +

The format for time-series data in graphite looks like this

+ +
1
+
<key> <numeric value> <timestamp>
+ + +

Storage schemas

+ +

Graphite uses configurable storage schemas to define retention rates for +storing data. It matches data paths with a pattern and tells what frequency and +history for our data to store.

+ +

The following configuration example is taken from the StatsD documentation.

+ +
1
+2
+3
+
[stats]
+pattern = ^stats\..*
+retentions = 10:2160,60:10080,600:262974
+ + +

Which means these retentions will be used for every entry with a key matching +the pattern defined. The retention format is frequency:history. So this +configuration lets us store 10 second data for 6 hours, 1 minute data for +1 week, and 10 minute data for 5 years.

+ +

Visualizing a timer in Graphite

+ +

Knowing all this, we can now take a look at my simple ruby-script that collects +timings for HTTP requests.

+ +
1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+
#!/usr/bin/env ruby
+
+require 'rubygems' if RUBY_VERSION < '1.9.0'
+require './statsdclient.rb'
+require 'typhoeus'
+
+Statsd.host = 'localhost'
+Statsd.port = 8125
+
+def to_ms time
+  (1000 * time).to_i
+end
+
+while true
+  start_time = Time.now.to_f
+
+  resp = Typhoeus::Request.get 'http://www.example.org/system/information'
+
+  end_time = Time.now.to_f
+
+  elapsed_time = (1000 * end_time) - (to_ms start_time)
+  response_time = to_ms resp.time
+  start_transfer_time = to_ms resp.start_transfer_time
+  app_connect_time = to_ms resp.app_connect_time
+  pretransfer_time = to_ms resp.pretransfer_time
+  connect_time = to_ms resp.connect_time
+  name_lookup_time = to_ms resp.name_lookup_time
+
+  Statsd.timing('http_request.elapsed_time', elapsed_time)
+  Statsd.timing('http_request.response_time', response_time)
+  Statsd.timing('http_request.start_transfer_time', start_transfer_time)
+  Statsd.timing('http_request.app_connect_time', app_connect_time)
+  Statsd.timing('http_request.pretransfer_time', pretransfer_time)
+  Statsd.timing('http_request.connect_time', connect_time)
+  Statsd.timing('http_request.name_lookup_time', name_lookup_time)
+
+  sleep 10
+end
+
+ + +

Let's take a look at the visualized Graphite render from this data. The data is +from the last 2 minutes, and the elapsed_time target from our script above.

+ +

Image visualization

+ +
Render URL
+ +

Render URL used for the image below.

+ +
1
+
/render/?width=586&height=308&from=-2minutes&target=stats.timers.http_request.elapsed_time.sum
+ + +
Rendered image from Graphite
+ +

Rendered image from Graphite, a simple graph visualizing elapsed_time for http +requests over time.

+ +

+ +

JSON-data

+ +
Render URL
+ +

Render URL used for the JSON-data below.

+ +
1
+
/render/?width=586&height=308&from=-2minutes&target=stats.timers.http_request.elapsed_time.sum&format=json
+ + +
JSON-output from Graphite
+ +

In the results below, we can see the raw data from Graphite. It is data from 12 +different data points which means 2 minutes with the StatsD 10-second flush +interval. It is really this simple, Graphite just visualizes its data.

+ +

The JSON-data is beautified with JSONLint for viewing purposes.

+ +
1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+
[
+    {
+        "target": "stats.timers.http_request.elapsed_time.sum",
+        "datapoints": [
+            [
+                53.449951171875,
+                1343038130
+            ],
+            [
+                50.3916015625,
+                1343038140
+            ],
+            [
+                50.1357421875,
+                1343038150
+            ],
+            [
+                39.601806640625,
+                1343038160
+            ],
+            [
+                41.5263671875,
+                1343038170
+            ],
+            [
+                34.3974609375,
+                1343038180
+            ],
+            [
+                36.3818359375,
+                1343038190
+            ],
+            [
+                35.009033203125,
+                1343038200
+            ],
+            [
+                37.0087890625,
+                1343038210
+            ],
+            [
+                38.486572265625,
+                1343038220
+            ],
+            [
+                45.66064453125,
+                1343038230
+            ],
+            [
+                null,
+                1343038240
+            ]
+        ]
+    }
+]
+
+ + +

Visualizing a gauge in Graphite

+ +

The following simple script ships a gauge to StatsD, simulating a number of +user registrations.

+ +
1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+
#!/usr/bin/env ruby
+
+require './statsdclient.rb'
+
+Statsd.host = 'localhost'
+Statsd.port = 8125
+
+user_registrations = 1
+
+while true
+  user_registrations += Random.rand 128
+
+  Statsd.gauge('user_registrations', user_registrations)
+
+  sleep 10
+end
+
+ + +

Image visualization - Number of user registrations

+ +
Render URL
+ +

Render URL used for the image below.

+ +
1
+
/render/?width=586&height=308&from=-20minutes&target=stats.gauges.user_registrations
+ + +
Rendered image from Graphite
+ +

Another simple graph, just showing the total number of registrations.

+ +

+ +

Image visualization - Number of user registrations per minute

+ +

By using the derivative-function in Graphite, we can get the number of user +registrations per minute.

+ +
Render URL
+ +

Render URL used for the image below.

+ +
1
+
/render/?width=586&height=308&from=-20minutes&target=derivative(stats.gauges.user_registrations)
+ + +
Rendered image from Graphite
+ +

A graph based on the same data as above, but with the derivative function +applied to visualize a per-minute rate.

+ +

+ +

Conclusion

+ +

Knowing more about how StatsD and Graphite work, it will be a lot easier to +know what kind of data to ship StatsD, to know how to ship the data to StatsD, +and to know how to read the data from Graphite.

+ +

Got any comments or questions? Let me know in the comment section below.

+
+ + + +
+ +
+

Comments

+
+
+ +
+ + + + +
+
+ + + + + + + +
+ + + + + + + + + + + + + + + + +
diff --git a/src/main/resources/com/gravity/goose/text/stopwords-all.txt b/src/main/resources/com/gravity/goose/text/stopwords-all.txt new file mode 100644 index 000000000..9841ae222 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-all.txt @@ -0,0 +1,6197 @@ + + +# forms of ser, to be (not including the infinitive): +# translated +#----------------------------------------------------------------------- +/DIE +Ab +Aber +Abgeordneten +Alle +Allerdings +Als +Alter +Am +Amt +An +Anfang +Angaben +Antrag +April +Arbeit +Art +Artikel +Auch +Auf +Aufgabe +Augen +August +Aus +Außerdem +Bad +Band +Bau +Bayern +Bedeutung +Beginn +Begriff +Bei +Beifall +Beim +Beispiel +Bereich +Bericht +Berliner +Bevölkerung +Bild +Bilder +Bis +Blick +Buch +Bundesregierung +BÜNDNIS +Bürger +Bürgermeister +CDU +CDU/CSU +China +DM +Da +Dabei +Damit +Dann +Das +Daten +Dazu +Den +Denn +Der +Deshalb +Deutsche +Deutschen +Deutschland +Dezember +Die +Dienstag +Dies +Diese +Dieser +Dieses +Doch +Donnerstag +Dort +Dr +Dr. 
+Druck +Du +Durch +Ein +Eine +Einsatz +Einwohner +Eltern +Ende +Entscheidung +Entwicklung +Er +Erfolg +Ergebnis +Erst +Es +Euro +Europa +Europäischen +FC +Fall +Familie +Februar +Fenster +Film +Firma +Folge +Form +Frage +Fragen +Frankfurt +Frankfurter +Frankreich +Franz +Frau +Frauen +Freitag +Friedrich +Für +GRÜNEN +Gebiet +Geld +Gemeinde +Gemeinden +Geschichte +Gesellschaft +Gesetz +GmbH +Gott +Grund +Gruppe +Grünen +Hamburg +Hand +Hans +Haus +Hause +Heinrich +Herr +Herren +Herrn +Heute +Hier +Hilfe +Hälfte +Höhe +ISBN +Ich +Ihnen +Ihr +Ihre +Im +In +Informationen +Interesse +Internet +Ist +Italien +Ja +Jahr +Jahre +Jahren +Jahres +Jahrhundert +Jahrhunderts +Januar +Jetzt +Johann +John +Juli +Juni +Kampf +Karl +Karriere +Kilometer +Kind +Kinder +Kirche +Klaus +Kollegen +Kommission +Kopf +Kosten +Krieg +Kritik +Kultur +Kunst +Köln +König +Lage +Land +Landes +Leben +Leute +Liebe +Liste +Literatur +London +Länder +Ländern +Mai +Mal +Man +Mann +Mannheim +Mannschaft +Mark +Markt +Martin +Maßnahmen +Meine +Meinung +Menschen +Meter +Michael +Milliarden +Millionen +Minuten +Mit +Mitarbeiter +Mitglied +Mitglieder +Mitte +Mittel +Mittwoch +Monate +Monaten +Montag +Morgen +Musik +Mutter +Männer +März +Möglichkeit +München +Nach +Nachdem +Nacht +Name +Namen +Neben +Nein +Neue +New +Nicht +Noch +Norden +November +Nr. +Nun +Nur +Nähe +Oktober +Opfer +Ort +Osten +PDS +Paris +Parlament +Partei +Paul +Personen +Peter +Platz +Politik +Politiker +Polizei +Preis +Problem +Probleme +Programm +Prozent +Präsident +Punkt +Quellen +Rahmen +Rat +Raum +Recht +Regel +Regie +Regierung +Region +Reihe +Richtung +Rolle +SPD +Sache +Saison +Samstag +Schon +Schule +Schweiz +Schweizer +Sein +Seine +Seit +Seite +Seiten +September +Sicherheit +Sie +Siehe +Situation +So +Sohn +Soldaten +Sommer +Sonntag +Spiel +Spiele +Spieler +Sprache +St. +Staat +Staaten +Stadt +Stelle +Straße +Stunden +Stuttgart +System +Tag +Tage +Tagen +Team +Teil +Tel. 
+The +Thema +Thomas +Titel +Tochter +Tod +Trainer +USA +Uhr +Um +Und +Union +Universität +Unter +Unternehmen +Unterstützung +Vater +Verein +Verfahren +Verfügung +Verlag +Viele +Von +Vor +Wahl +Was +Wasser +Weblinks +Weg +Weise +Weitere +Welt +Wenn +Wer +Werk +Werke +Westen +Wie +Wien +Wilhelm +Wir +Wirtschaft +Woche +Wochen +Wolfgang +Wort +Während +York +Zahl +Zeit +Zeitung +Ziel +Zu +Zukunft +Zum +Zur +Zusammenarbeit +Zusammenhang +a +a +a +a's +aan +ab +abad +abans +abban +abbia +abbiamo +abbiano +abbiate +aber +abia +able +about +above +aby +acara +according +accordingly +acea +aceasta +aceea +aceeasi +aceh +aceia +acel +acela +acelasi +acelea +acerca +acest +acesta +aceste +acestea +acestei +acestia +acestui +ach +acht +acolo +across +actually +acum +acz +aczkolwiek +ací +ad +ada +adalah +adanya +adapun +adica +af +after +afterwards +again +against +agak +agaknya +agama +agar +agl +agl +agli +agli +agora +agustus +ah +ahhoz +ahogy +ahol +ai +ai +ai +aia +aici +aie +aient +aies +ain't +air +ait +aiurea +així +això +aj +akan +akankah +akhir +akhiri +akhirnya +aki +akibat +akik +akkor +aku +akulah +al +al +al +ala +alam +alatt +alaturi +albo +album +aldrig +ale +aleshores +ależ +algmas +algo +algun +alguna +algunas +algunes +algunos +alguns +alguns +alhora +ali +all +all +alla +alla +alla +alle +alle +alle +allein +allem +allen +aller +allerdings +alles +allo +allo +allow +allows +allt +alltför +alltid +allvarlig +allvarligt +allà +allí +allò +almost +alone +along +already +als +als +also +alt +alta +altceva +alte +alte +alten +altfel +although +alti +altii +altra +altre +altres +altul +always +am +am +amat +amatlah +amb +ambdues +ambdós +ambos +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +amerika +ami +amikor +amit +amolyan +among +amongst +amp +amíg +an +anak +anche +anche +and +and +anda +andalah +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +andet +andra +andre +ange +anggota +ani +annak +annars +another 
+ansiosta +antar +antara +antarabangsa +antaranya +ante +antes +antingen +anume +använda +användbar +använder +användning +används +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +ao +aos +apa +apaan +apabila +apakah +apalagi +apart +apatah +api +apoi +apontar +appear +appreciate +appropriate +april +aquela +aquelas +aquele +aqueles +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aqui +aquilo +aquí +ar +arbeid +arbeiten +are +are +aren't +around +arra +arról +artikel +artinya +as +as +asa +asal +asalkan +asas +asemenea +asia +aside +asing +ask +asking +associated +associerad +asta +astazi +astfel +asupra +at +at +atare +atas +atau +ataukah +ataupun +ati +atit +atita +atitea +atitia +atrás +att +atunci +até +au +au +auch +auf +aufgrund +aura +aurai +auraient +aurais +aurait +auras +aurez +auriez +aurions +aurons +auront +aus +australia +aux +außerdem +av +avaient +available +avais +avait +avea +avec +avem +avemmo +avendo +avesse +avessero +avessi +avessimo +aveste +avesti +avete +aveva +avevamo +avevano +avevate +avevi +avevo +avez +aviez +avions +avons +avra +avrai +avranno +avrebbe +avrebbero +avrei +avremmo +avremo +avreste +avresti +avrete +avro +avrà +avrò +avut +avuta +avute +avuti +avuto +awal +awalnya +awam +away +awfully +ayant +ayez +ayons +az +azi +azok +azon +azonban +azt +aztán +azután +azzal +azért +aż +b +ba +badan +bagai +bagaikan +bagaimana +bagaimanakah +bagaimanapun +bagainamakah +bagi +bagian +bahagian +bahan +baharu +bahasa +bahawa +bahkan +bahwa +bahwasannya +bahwasanya +baik +baiknya +baix +bakal +bakalan +bakom +bald +balik +bandar +bangsa +bank +banyak +bapak +bara +barang +barangan +barat +bardziej +bardzo +bare +baru +baru-baru +bawah +be +be +beberapa +became +because +become +becomes +becoming +bedeutet +been +befindet +before +beforehand +begann +begge +begini +beginian +beginikah +beginilah +begitu +begitukah +begitulah +begitupun +behind +behov +behöver +bei +beide +beiden +beim +being 
+beispielsweise +bekannt +bekas +bekerja +bekommen +belakang +belakangan +belanda +beli +beliau +believe +below +belum +belumlah +belül +bem +benar +benarkah +benarlah +benne +bentuk +berada +berakhir +berakhirlah +berakhirnya +berapa +berapakah +berapalah +berapapun +berarti +berasal +berat +berawal +berbagai +berbanding +berbeda +berdasarkan +berdatangan +bereit +bereits +berharap +berhasil +beri +berikan +berikut +berikutan +berikutnya +berita +berjalan +berjaya +berjumlah +berkaitan +berkali +berkali-kali +berkata +berkehendak +berkeinginan +berkenaan +berlainan +berlaku +berlalu +berlangsung +berlebihan +bermacam +bermacam-macam +bermain +bermaksud +bermula +bernama +bernilai +bersama +bersama-sama +bersiap +bertanya +bertemu +berturut +bertutur +berubah +berujar +berupa +berätta +besar +beside +besides +beskrivits +besok +besonders +besser +best +besteht +besten +better +betul +betulkah +between +betyda +bevor +beyond +bezeichnet +bhd +biasa +biasanya +bidang +bietet +bij +bila +bilakah +bilion +bin +bine +bintang +bis +bisa +bisakah +bisher +bist +bland +ble +bleiben +bleibt +blev +bli +blieb +blir +blog +bn +bo +bola +boleh +bolehkah +bolehlah +bom +borde +bort +borta +bortom +both +bowiem +bra +bredvid +brief +bringen +brinner +bruke +buat +bukan +bukankah +bukanlah +bukannya +buku +bulan +bumi +bung +bursa +but +by +by +byli +bynajmniej +być +był +była +było +były +bzw. +bár +bäst +bättre +både +bör +bör inte +będzie +będą +c +c +c +c'mon +c'mon +c's +c: s +ca +ca. 
+cada +cada +cadangan +cadascuna +cadascunes +cadascuns +cadascú +cali +cam +came +caminho +campaign +can +can't +cannot +cant +cant +capat +cara +caranya +care +careia +carora +caruia +catch +catre +cause +causes +cała +cały +ce +ce +cea +ceci +ceea +cei +ceilalti +cel +cele +celor +celà +certain +certainly +ces +cet +cette +ceva +changes +che +che +chi +chi +chiar +china +ci +ci +ci +ciebie +cikk +cikkek +cikkeket +cima +cind +cine +cineva +cit +cita +cite +citera +citeva +citi +citiva +cię +clearly +click +co +co +code +coi +coi +cokolwiek +col +col +com +com +come +come +comes +como +comprido +con +con +concerning +conform +conhecido +consequently +consider +considering +contain +containing +contains +contra +contra +contro +contro +copyright +corrente +corresponding +could +couldn't +course +coś +criticized +csak +cu +cual +cuando +cui +cui +cui +cukup +cukupkah +cukuplah +cum +cuma +cumva +currently +czasami +czasem +czemu +czy +czyli +d +d +d'un +d'una +d'unes +d'uns +da +da +da +dabei +daca +dadurch +daerah +dafür +dagangan +dagegen +dagl +dagl +dagli +dagli +daher +dahulu +dai +dai +dal +dal +dalam +daleko +dall +dall +dalla +dalla +dalle +dalle +dallo +dallo +dalt +damals +damit +dan +dana +danach +dann +dans +dapat +dar +daran +darauf +darf +dari +darin +daripada +darunter +darüber +das +dasar +dass +dasselbe +dat +data +datang +datuk +davon +dazu +daß +de +de +de hade +de har +de kommer +de är +deasupra +debaixo +deci +decit +definitely +definitivt +deg +degl +degl +degli +degli +degraba +dei +dei +dei +dein +deine +deinem +deinen +deiner +deines +deja +dekat +del +del +del +dela +delas +dele +deles +dell +dell +della +della +delle +delle +dello +dello +dels +dem +dem +demi +demikian +demikianlah +demselben +den +den +denen +dengan +denn +denne +denselben +dentro +depan +depois +der +der +deras +dere +deren +derer +deres +derivatives +derselbe +derselben +derzeit +des +des +desa +described +desde +desember +deshalb +desi +desligado +despite +despre 
+després +dess +dessa +desselben +dessen +dessutom +det +det andra +det finns +det ska +det skulle +det är +detik +detta +dette +deutlich +deutsche +deutschen +deutscher +deve +devem +deverá +dewan +di +di +di +dia +diadakan +diakhiri +diakhirinya +dialah +dianggap +diantara +diantaranya +diberi +diberikan +diberikannya +dibuat +dibuatnya +dibuka +dicatatkan +dich +did +didapat +didatangkan +didirikan +didn't +diduga +die +die +dies +diese +dieselbe +dieselben +diesem +diesen +dieser +dieses +different +dig +digunakan +diibaratkan +diibaratkannya +diingat +diingatkan +diinginkan +dijangka +dijawab +dijelaskan +dijelaskannya +dikarenakan +dikatakan +dikatakannya +dikenal +dikerjakan +diketahui +diketahuinya +dikira +dilakukan +dilalui +dilihat +dimaksud +dimaksudkan +dimaksudkannya +dimaksudnya +dimana +diminta +dimintai +dimisalkan +dimulai +dimulailah +dimulainya +dimungkinkan +din +dini +diniagakan +dins +dintr +dintre +dipastikan +diperbuat +diperbuatnya +dipergunakan +diperkirakan +diperlihatkan +diperlukan +diperlukannya +dipersoalkan +dipertanyakan +dipunyai +dir +direita +direkt +diri +dirilis +dirinya +dis +disampaikan +disebut +disebutkan +disebutkannya +disember +disini +disinilah +disse +distrik +dit +ditambahkan +ditandaskan +ditanya +ditanyai +ditanyakan +ditegaskan +ditemukan +ditt +ditujukan +ditunjuk +ditunjuki +ditunjukkan +ditunjukkannya +ditunjuknya +ditutup +dituturkan +dituturkannya +diucapkan +diucapkannya +diungkapkan +diz +dizer +dla +dlaczego +dlatego +do +do +doar +dobrze +doch +dock +document.write +does +doesn't +dog +doing +dois +dokąd +dolar +don't +donat +doncs +donde +done +dong +dort +dos +dov +dov +dove +dove +down +downwards +dość +dpa +dr +drei +du +du +du har +du skulle +du är +dua +dulu +dunia +dupa +durant +durante +durch +during +dużo +dwa +dwaj +dwie +dwoje +dzisiaj +dziś +där +därefter +därför +däri +därifrån +därigenom +därpå +då +dürfen +e +e +e +ea +each +ebbe +ebben +ebbero +ebbi +eben +ebenfalls +ebenso +ed +ed +eddig 
+edu +edu +een +effective +efter +eftersom +efteråt +ega +egen +egy +egyes +egyetlen +egyik +egyre +egyéb +egész +eh +ehemaligen +eher +ehhez +ehkä +ei +eigene +eigenen +eigentlich +eight +ein +eine +einem +einen +einer +eines +einfach +eingesetzt +einig +einige +einigem +einigen +einiger +einiges +einmal +einzelnen +einzige +either +ej +ekkor +ekonomi +eksekutif +eksport +el +el +ela +elas +ele +ele +electronic +eles +ella +ellas +elle +ellen +eller +ellos +els +else +elsewhere +első +elég +elő +először +előtt +em +em +emilyen +empat +en +en +en annan +en gång +enam +encara +end +endast +endorsed +ene +enemmän +eneste +enggak +enggaknya +enhver +enligt +enn +ennek +ennen +enough +enquanto +ens +ensam +entah +entahlah +entirely +entre +entre +entwickelt +então +er +er +era +era +erais +eram +eran +erano +eras +eravamo +eravate +eren +eres +erhalten +erhielt +eri +erklärt +erklärte +erneut +ero +eropa +err +erre +erreichen +erreicht +erst +erste +ersten +erster +erstmals +es +es +esa +esas +ese +eso +esos +especially +essa +essas +esse +essendo +esses +est +esta +esta +estaba +estabais +estaban +estabas +estad +estada +estadas +estado +estados +estamos +estando +estar +estaremos +estará +estarán +estarás +estaré +estaréis +estaría +estaríais +estaríamos +estarían +estarías +estas +estava +estavam +estaven +este +este +esteja +estejam +estejamos +estemos +estes +esteu +esteve +estive +estivemos +estiver +estivera +estiveram +estiverem +estivermos +estivesse +estivessem +estivéramos +estivéssemos +esto +estos +estou +estoy +estuve +estuviera +estuvierais +estuvieran +estuvieras +estuvieron +estuviese +estuvieseis +estuviesen +estuvieses +estuvimos +estuviste +estuvisteis +estuviéramos +estuviésemos +estuvo +està +estàvem +estàveu +está +estábamos +estáis +están +estás +estávamos +estão +esté +estéis +estén +estés +et +et +etc +etc +etessa +ets +ett +etter +ettor +etwa +etwas +eu +eu +euch +eue +euer +eues +eure +eurem +euren +eurent +eurer +eures +europäischen +eus 
+eusse +eussent +eusses +eussiez +eussions +eut +eux +even +ever +every +everybody +everyone +everything +everywhere +ex +ex +exact +exactly +exakt +example +except +exempel +ez +ezek +ezen +ezt +ezzel +ezért +eûmes +eût +eûtes +f +faccia +facciamo +facciano +facciate +faccio +face +facemmo +facendo +facesse +facessero +facessi +facessimo +faceste +facesti +faceva +facevamo +facevano +facevate +facevi +facevo +faedah +fai +faktiskt +fand +fanno +far +fara +farai +faranno +farebbe +farebbero +farei +faremmo +faremo +fareste +faresti +farete +farà +fará +farò +fast +fata +faz +fazer +fazia +fd +feb +fece +fecero +feci +fel +felé +fem +femte +fest +few +fez +fi +fick +fie +fifth +film +fim +financial +finansiella +finden +findet +fins +fire +first +five +flera +flere +fleste +foarte +foi +folgenden +folk +followed +following +follows +fomos +for +for +fora +fora +foram +fordi +forem +former +formerly +formos +forrige +forsÛke +fortfarande +forth +fosse +fossem +fossero +fossi +fossimo +fost +foste +fosti +four +fra +fram +framåt +frei +from +frågar +från +früher +fu +fue +fuera +fuerais +fueran +fueras +fueron +fuese +fueseis +fuesen +fueses +fui +fuimos +fuiste +fuisteis +fummo +furent +furono +further +furthermore +fus +fusse +fussent +fusses +fussiez +fussions +fut +fuéramos +fuésemos +fyra +fÅ +fÛr +fÛrst +få +fått +fôramos +fôssemos +följaktligen +följer +följt +för +förhand +förhoppningsvis +förmodligen +förnuftig +först +försök +försöker +försökte +förutom +förändringar +før +først +fûmes +fût +fûtes +führen +führt +führte +fünf +für +g +gab +gairebé +gammal +ganska +ganz +ganze +ganzen +gar +gat +gdy +gdyby +gdyż +gdzie +gdziekolwiek +gdzieś +geaba +geben +gebracht +gedung +gefunden +gegeben +gegen +gegenüber +gegründet +gehen +geht +gehören +gehört +gehörte +gekommen +gelar +gemacht +gemeinsam +genannt +genau +genom +genommen +genug +ger +gerade +gesagt +gesehen +gestellt +gestern +get +gets +getting +gettracker +gewann +gewesen +geworden +gibt +gick +gillade 
+gilt +ging +given +given +gives +gjorde +gjort +gjÛre +gleich +gleichen +gleichzeitig +gli +gli +global +go +god +godkändes +goes +going +gone +got +gotten +greetings +große +großen +großer +grundlig +grundligt +grup +größte +größten +guna +gunakan +gunung +gut +gute +guten +gÅ +gäller +gå +går +gör +göra +h +ha +ha +hab +habe +haben +habida +habidas +habido +habidos +habiendo +habremos +habrá +habrán +habrás +habré +habréis +habría +habríais +habríamos +habrían +habrías +habéis +había +habíais +habíamos +habían +habías +had +had +hadap +hadapan +hadde +hade +hade inte +hadn't +hai +haikki +haja +hajam +hajamos +hal +hallå +halten +hampir +han +han +han är +handelt +hanem +hanno +hans +hanya +hanyalah +happens +har +har inte +hardly +harga +hari +harian +harus +haruslah +harusnya +has +has +hasil +hasn't +hasta +hat +hatte +hatten +have +havemos +haven't +havia +having +hay +haya +hayamos +hayan +hayas +hayáis +he +he +he's +heb +hei +heißt +hej +hela +hello +help +helt +hem +hemos +hence +hence +hendak +hendaklah +hendaknya +hendes +henne +hennes +her +her +here +here's +hereafter +hereby +herein +hereupon +hers +herself +het +heu +heute +heutigen +hi +hi +hidup +hier +hij +him +himself +hin +hinaus +hingga +hinter +his +hiszen +hit +hitaasti +hither +hjälpa +ho +ho +hoch +hoe +hogy +hogyan +hohe +hohen +hoikein +hon +honom +hopefully +horas +houve +houvemos +houver +houvera +houveram +houverei +houverem +houveremos +houveria +houveriam +houvermos +houverá +houverão +houveríamos +houvesse +houvessem +houvéramos +houvéssemos +how +howbeit +howbeit +however +https +hube +hubiera +hubierais +hubieran +hubieras +hubieron +hubiese +hubieseis +hubiesen +hubieses +hubimos +hubiste +hubisteis +hubiéramos +hubiésemos +hubo +hubungan +hukum +hun +hur +hur som helst +huruvida +hutan +huvudsakligen +hva +hvad +hvem +hver +hvilken +hvis +hvor +hvordan +hvorfor +hvornår +hyvin +há +hão +hälsningar +hält +hän +händer +här +här finns +härefter +häri +härmed +härpå +hätte +hätten 
+häufig +hålla +håller +hålls +höger +i +i +i +i enlighet med detta +i'd +i'll +i'm +i've +ia +iaitu +ialah +iar +ibarat +ibaratkan +ibaratnya +ibland +ibu +ich +ich +ici +icke +if +igen +ignored +ignoreras +igual +iguals +ihm +ihn +ihnen +ihr +ihre +ihrem +ihren +ihrer +ihres +ii +ik +ikke +iklan +ikut +il +il +il +ile +ill +ill. +illetve +ilman +ilmu +ils +ilyen +ilyenkor +im +im +imi +immediate +immer +in +in +in +inainte +inapoi +inasmuch +inc +inc +inca +incit +ind +indeed +indeed +indeks +indem +india +indicate +indicated +indicates +indikera +indikerade +indikerar +indonesia +industri +informasi +informationsproblem +ingat +ingen +ingenstans +ingenting +inggris +ingin +inginkah +inginkan +ini +iniciar +inicio +inikah +inilah +inn +inna +innan +inne +innehålla +innehållande +innehåller +innen +inner +innerhalb +inny +innych +inom +inre +ins +insa +insbesondere +insgesamt +insofar +instead +inte +internasional +internationalen +intet +into +intr +intre +inward +inzwischen +inåt +io +io +ir +irá +is +is +isi +islam +ismét +isn't +isnin +ison +isso +ist +ista +iste +isto +istället +isu +isär +it +it'd +it'll +it's +italia +iti +its +itself +itt +itu +itukah +itulah +iż +j +j +ja +ja +jabatan +jadi +jadilah +jadinya +jag har +jag ska +jag skulle +jag är +jak +jakarta +jakaś +jakby +jaki +jakichś +jakie +jakiś +jakiż +jakkolwiek +jako +jakoś +jalan +jalur +jaman +jan +jangan +jangankan +janganlah +januari +jauh +jawa +jawab +jawaban +jawabnya +jawatan +jawatankuasa +je +je +jede +jedem +jeden +jeden +jeder +jedes +jedna +jednak +jednakże +jedno +jedoch +jeg +jego +jej +jelas +jelaskan +jelaslah +jelasnya +jemu +jene +jenem +jenen +jener +jenes +jenis +jepang +jepun +jeres +jerman +jest +jestem +jeszcze +jetzt +jeweils +jeśli +jeżeli +jika +jikalau +jiwa +jobban +jos +jual +jualan +juga +julai +jumaat +jumat +jumlah +jumlahnya +jun +juni +just +justru +juta +już +já +jälkeen +jó +jól +ją +k +kabar +kabupaten +kadar +kala +kalangan +kalau +kalaulah +kalaupun +kali 
+kalian +kalimantan +kam +kamen +kami +kamilah +kamis +kampanj +kamu +kamulah +kan +kan inte +kann +kanske +kanssa +kantor +kapal +kapan +kapankah +kapanpun +karena +karenanya +karya +kasus +kata +katakan +katakanlah +katanya +kaukana +kaum +kaunter +kawasan +każdy +ke +keadaan +kebetulan +kebutuhan +kecamatan +kecil +kedua +kedua-dua +keduanya +kedudukan +keep +keeps +kegiatan +kehidupan +kein +keine +keinem +keinen +keiner +keines +keinginan +kejadian +kekal +kelamaan +kelihatan +kelihatannya +kelima +kell +kellett +kelompok +keluar +keluarga +kelurahan +kembali +kementerian +kemudahan +kemudian +kemungkinan +kemungkinannya +kenaikan +kenapa +kenties +kenyataan +kepada +kepadanya +kepala +kepentingan +kept +keputusan +kerajaan +kerana +keres +keressünk +keresztül +kereta +kerja +kerjasama +kes +kesampaian +keselamatan +keseluruhan +keseluruhannya +kesempatan +kesihatan +keskellä +kesken +keterangan +keterlaluan +ketiga +ketika +ketua +keuntungan +kewangan +khamis +khusus +khususnya +ki +kiedy +kilka +kimś +kini +kinilah +kira +kira-kira +kiranya +kita +kitalah +klar +klart +klci +kleine +kleinen +klibor +klik +km +km +knapp +knappast +know +known +knows +kok +kom +komentar +komma +kommen +kommer +kommer du +kommer inte +kommt +kompas +komposit +kondisi +konnte +konnten +kontrak +korban +korea +kort +kos +koskaan +kota +kritiserade +kto +ktokolwiek +ktoś +która +które +którego +której +który +których +którym +którzy +ku +kuala +kuasa +kuinkan +kukka +kukuh +kumpulan +kun +kunde +kunde inte +kunna +kunne +kurang +kurangnya +kurs +kurz +kvartalsvis +kylliksi +kyllä +känd +kívül +könne +können +könnte +könnten +között +közül +l +l +l +l'hi +la +la +la +lag +lage +lagi +lagian +lagu +lah +lain +lainnya +laku +lalu +lama +lamanya +lang +lang +lange +langkah +langsung +lanjut +lanjutnya +laporan +las +lassen +last +lat +lately +later +latter +latterly +latterly +laut +laut +lav +le +le +le +least +leben +lebih +lecz +lediglich +legalább +legyen +lehet +lehetett +lei 
+lei +leicht +lembaga +lenne +lenni +lepas +les +les +less +lest +lest +lesz +let +let's +lett +letzte +letzten +leur +leurs +lewat +lhe +lhes +li +li +li +li'n +lidt +liegen +liegt +ließ +ligado +liian +lik +like +like +liked +likely +liknande +lille +lima +lingkungan +lite +little +lla +llavors +lo +lo +login +lokasi +look +looking +looks +lor +loro +loro +los +lot +ltd +ltd +luar +luas +lub +lui +lui +lui +lumpur +luona +lähellä +lämpligt +längs +läpi +lässt +läuft +långt +låt +låt oss +m +m +m'he +ma +ma +ma +mac +macam +machen +macht +machte +maga +magát +mahkamah +mahu +mai +mainly +maioria +maiorias +mais +majd +majlis +mają +maka +makanan +makanya +makin +maklumat +makt +mal +mal +malah +malahan +malam +malaysia +malgrat +mam +mampu +mampukah +man +man +mana +manakala +manalagi +manche +manchem +manchen +mancher +manches +mand +mange +mantan +manusia +many +mare +mas +masa +masalah +masalahnya +masih +masihkah +masing +masing-masing +masuk +masyarakat +mata +mateix +mateixa +mateixes +mateixos +mau +maupun +may +maybe +mało +me +me +mean +meanwhile +measure +med +medan +media +meg +meget +mehr +mehrere +mei +mein +meine +meinem +meinen +meiner +meines +meist +meisten +melainkan +melakukan +melalui +melawan +melihat +melihatnya +mellan +mellett +mely +melyek +memandangkan +memang +memastikan +membantu +membawa +memberi +memberikan +membolehkan +membuat +memerlukan +memihak +memiliki +meminta +memintakan +memisalkan +memperbuat +mempergunakan +memperkirakan +memperlihatkan +mempersiapkan +mempersoalkan +mempertanyakan +mempunyai +memulai +memungkinkan +men +menaiki +menambah +menambahkan +menandaskan +menanti +menantikan +menanya +menanyai +menanyakan +menarik +menawarkan +mencapai +mencari +mencatatkan +mendapat +mendapatkan +mendatang +mendatangi +mendatangkan +menegaskan +menerima +menerusi +mengadakan +mengakhiri +mengaku +mengalami +mengambil +mengapa +mengatakan +mengatakannya +mengenai +mengerjakan +mengetahui +menggalakkan +menggunakan +menghadapi 
+menghendaki +mengibaratkan +mengibaratkannya +mengikut +mengingat +mengingatkan +menginginkan +mengira +mengucapkan +mengucapkannya +mengumumkan +mengungkapkan +mengurangkan +meninggal +meningkat +meningkatkan +menjadi +menjalani +menjawab +menjelang +menjelaskan +menokok +mens +menteri +mentre +menuju +menunjuk +menunjuki +menunjukkan +menunjuknya +menurut +menuturkan +menyaksikan +menyampaikan +menyangkut +menyatakan +menyebabkan +menyebutkan +menyediakan +menyeluruh +menyiapkan +mer +merasa +mere +mereka +merekalah +merely +merosot +mert +merupakan +mes +meski +meskipun +mesmo +mest +mestadels +mesyuarat +met +metrotv +meu +meu +meus +meus +meva +meves +meyakini +meyakinkan +mi +mi +mi +mia +mia +mich +mie +mie +miei +miei +mig +might +mij +mikor +miksi +mikä +milik +militer +milloin +milloinkan +milyen +mimo +min +minat +minden +mindenki +mindent +mindestens +mindig +mindre +minggu +minha +minhas +minst +mint +minta +mintha +minyak +minä +mio +mio +mir +mirip +mis +misal +misalkan +misalnya +missä +mit +mit +miten +mivel +miért +między +mnie +mną +mobil +mod +modal +mogą +mohd +moi +moi +moim +moja +moje +molt +molta +moltes +molts +mon +mon +mons +more +moreover +most +most +mostly +mot +motsvarande +może +możliwe +można +mu +much +mucho +muchos +mudah +muito +muitos +mula +mulai +mulailah +mulanya +mult +multa +multe +multi +muncul +mungkin +mungkinkah +musi +musik +musim +muss +musste +must +muy +muß +my +my +mycket +mye +myself +mÅ +mÅte +már +más +más +másik +må +mån +många +måste +måte +még +més +même +mí +mía +mías +míg +mío +míos +mój +möchte +möglich +möjligt +müsse +müssen +n +n +n'he +n'hi +na +na +nach +nachdem +nad +nada +nagy +nagyobb +nagyon +nah +nahm +naik +nam +nama +name +namely +nami +namn +namun +nanti +nantinya +nas +nas +nasi +nasional +nasz +nasza +nasze +naszego +naszych +natomiast +natychmiast +natürlich +navn +nawet +nd +nd +ne +ne +ne +near +nearly +neben +necessary +ned +nedan +nedåt +need +needs +negara +negara-negara +negeri 
+negl +negl +negli +negli +nehmen +nei +nei +nei +neither +nekem +neki +nel +nel +nell +nell +nella +nella +nelle +nelle +nello +nello +nem +nem +neu +neue +neuen +never +nevertheless +new +new +next +ni +ni +niaga +nic +nich +nicht +nichts +nici +niciodata +nie +nie +niech +niego +niej +niemu +nigdy +nilai +nim +nimeni +nimi +nimic +nimmt +nincs +nine +nio +niste +nią +niż +no +no +nobody +noch +noe +noen +nog +nogen +nogensmenys +noget +noi +noi +noi +noll +nome +nomor +només +non +non +none +noone +nopeasti +nor +normally +normalt +nos +nosaltres +nosotras +nosotros +nossa +nossas +nosso +nossos +nostra +nostra +nostra +nostre +nostre +nostre +nostres +nostri +nostri +nostri +nostro +nostro +not +nothing +notre +nou +noua +noun +nous +nov +novel +november +novo +now +nowhere +nu +nuestra +nuestras +nuestro +nuestros +num +numa +numai +numeral +numeralia +nun +nur +ny +nya +nyaris +nyatanya +nyligen +nyt +nÅ +nÅr +não +nächsten +nämlich +nämligen +när +nära +närhelst +närvarande +nästa +nästan +nå +någon +någon annanstans +någonsin +någonstans +något +några +når +nær +næste +næsten +néha +néhány +nélkül +nós +nödvändigt +o +o +o +oavsett +ob +oben +obok +obviously +obwohl +och +också +od +oder +of +of +off +off +official +oft +ofta +often +og +ogos +ogsÅ +også +oh +oh +ohne +oi +oikea +oikealla +ok +ok +okay +okay +około +okt +oktober +olah +old +oleh +olehnya +olika +olyan +om +omedelbar +on +on +ona +once +onde +one +one +ones +oni +only +ono +ons +ont +onto +ook +op +operasi +opp +or +or +orang +oraz +organisasi +ori +orice +oricum +orsaka +orsaker +os +osannolikt +oss +other +others +otherwise +oto +otra +otras +otro +otros +ott +otte +ou +ought +our +ours +ourselves +out +outro +outside +ovan +over +over +overall +own +owszem +p +paar +pada +padahal +padanya +pagetracker +pagi +pai +pak +paling +paljon +pameran +pan +pana +pani +panjang +pantas +papan +par +para +para +paras +parca +parlimen +part +partai +parte +parti +particle +particular +particularly 
+pas +pas +pasar +pasaran +password +pasti +pastilah +pasukan +paticle +pe +pedig +pegar +pegawai +pejabat +pekan +pekerja +pel +pela +pelabur +pelaburan +pelancongan +pelanggan +pelas +pelbagai +pelo +pelos +pels +peluang +pemain +pembangunan +pemberita +pembinaan +pemerintah +pemerintahan +pemimpin +pendapatan +pendidikan +penduduk +penerbangan +pengarah +pengeluaran +pengerusi +pengguna +penggunaan +pengurusan +peniaga +peningkatan +penting +pentingnya +pentru +per +per +per +perancis +perang +peratus +perche +perchè +perché +percuma +perdagangan +perdana +perhaps +peringkat +perjanjian +perkara +perkhidmatan +perladangan +perlu +perlukah +perlunya +permintaan +pernah +perniagaan +pero +perquè +persekutuan +persen +persidangan +persoalan +persze +pertama +pertandingan +pertanyaan +pertanyakan +pertubuhan +pertumbuhan +perubahan +perusahaan +però +pesawat +peserta +pessoas +peste +petang +pihak +pihaknya +pilihan +pina +pinjaman +piu +più +più +placed +placeras +please +plus +plus +po +poc +poca +poco +pocs +pod +podczas +pode +poderá +podia +polis +polisi +politik +politische +politischen +pomimo +ponad +ponieważ +poques +por +porque +pos +posisi +possible +potser +pour +povo +powinien +powinna +powinni +powinno +poza +prawie +prea +presiden +prestasi +presumably +prin +pro +probably +produk +program +projek +promeiro +pronomia +pronoun +propi +proses +proton +provides +provinsi +przecież +przed +przede +przedtem +przez +przy +pt +pubdate +pukul +pula +pulau +pun +punkt +punya +pusat +putini +pÅ +på +på något sätt +q +qu +qual +qual +quale +quale +qualquer +quals +quan +quando +quant +quanta +quanta +quante +quante +quanti +quanti +quanto +quanto +quarterly +que +que +quel +quelcom +quella +quella +quelle +quelle +quelles +quelli +quelli +quello +quello +quels +quem +questa +questa +queste +queste +questi +questi +questo +questo +qui +qui +quien +quienes +quieto +quin +quina +quines +quins +quite +quote +què +qué +quê +r +rabu +radio +raja +rakan +rakyat +ramai 
+rantau +rasa +rasanya +rata +rather +raya +really +reasonably +recht +redan +regarding +regardless +regards +relatively +relativt +rendah +republik +resmi +respectively +respektive +rett +ribu +richtig +right +riktig +rimligen +ringgit +roku +roman +root +ruang +rumah +rund +runt +rupa +rupanya +rá +również +s +s +s'ha +s'han +sa +sa +saat +saatnya +sabah +saber +sabtu +sagen +sagt +sagte +sah +sahaja +saham +sai +said +saja +sajalah +saját +sakit +salah +sale +saling +sam +sama +sama-sama +sambil +same +samma +samme +sampai +sampaikan +sana +sangat +sangatlah +sannolikt +sans +sant +sarai +saranno +sarawak +sarebbe +sarebbero +sarei +saremmo +saremo +sareste +saresti +sarete +saro +sarà +sarò +satu +sau +saw +sawit +say +saya +sayalah +saying +says +scheint +schließen +schließlich +schnell +schon +schwer +sdn +se +se +se +sea +seamos +sean +seas +sebab +sebabnya +sebagai +sebagaimana +sebagainya +sebagian +sebahagian +sebaik +sebaiknya +sebaliknya +sebanyak +sebarang +sebegini +sebegitu +sebelah +sebelum +sebelumnya +sebenarnya +seberapa +sebesar +sebetulnya +sebisanya +sebuah +sebut +sebutlah +sebutnya +secara +sechs +second +secondly +secukupnya +sedan +sedang +sedangkan +sedemikian +sedikit +sedikitnya +see +seeing +seem +seemed +seeming +seems +seen +seenaknya +seg +segala +segalanya +segera +segi +seharusnya +sehen +sehingga +sehr +sei +seien +sein +seine +seinem +seinen +seiner +seines +seingat +seit +seja +sejak +sejam +sejamos +sejarah +sejauh +sejenak +sejumlah +sekadar +sekadarnya +sekali +sekali-kali +sekalian +sekaligus +sekalipun +sekarang +sekaranglah +sekecil +seketika +sekiranya +sekitar +sekitarnya +sekolah +seks +sektor +sekurang +sekurangnya +sekuriti +sela +selagi +selain +selaku +selalu +selama +selama-lamanya +selamanya +selanjutnya +selasa +selatan +selbst +selepas +self +seluruh +seluruhnya +selv +selves +sem +sem +semacam +semakin +semalam +semampu +semampunya +semasa +semasih +semata +semaunya +semblant +semblants +sementara +semisal 
+semisalnya +semmi +sempat +semua +semuanya +semula +sen +senare +sendiri +sendirian +sendirinya +senin +sensible +sent +seolah +seolah-olah +seorang +sepak +sepanjang +sepantasnya +sepantasnyalah +seperlunya +seperti +sepertinya +sepihak +sept +september +ser +ser +sera +serai +seraient +serais +serait +serangan +serantau +seras +serei +seremos +serez +seri +seria +seriam +seriez +serikat +sering +seringnya +serions +serious +seriously +serons +seront +serta +serupa +será +serán +serás +serão +seré +seréis +sería +seríais +seríamos +serían +serías +ses +ses +sesaat +sesama +sesampai +sesegera +sesekali +seseorang +sesi +sesuai +sesuatu +sesuatunya +sesudah +sesudahnya +setelah +setempat +setengah +seterusnya +setiap +setiausaha +setiba +setibanya +setidak +setidaknya +setinggi +sett +setzt +setzte +seu +seu +seus +seus +seusai +seva +seven +several +seves +sewaktu +sex +seáis +shall +sharply +she +should +shouldn't +si +si +si +sia +siamo +siano +siap +siapa +siapakah +siapapun +siaran +siate +sich +sicher +sidang +siden +sido +sie +sieben +siehe +sieht +siellä +siendo +siete +sin +sin +since +sind +sine +singapura +sini +sinilah +sint +sintem +sinä +sist +sista +sistem +six +się +sju +själv +själva +skal +skall +skarpt +skickas +skulle +skulle inte +skąd +slik +slutt +snarare +snart +so +soal +soalnya +sobie +sobre +sobre +sobretot +sobą +sogar +soi +soient +sois +soit +sok +sokat +sokkal +sokongan +solament +solche +solchem +solchen +solcher +solches +soll +sollen +sollte +sollten +sols +som +some +somebody +somehow +somente +someone +something +sometime +sometime +sometimes +somewhat +somewhere +sommes +somos +son +son +sondern +sono +sons +sonst +sont +soon +sorry +sorry +sota +sou +sou +sowie +sowohl +soy +soyez +soyons +speciellt +specificerade +specified +specify +specifying +spielen +spielt +spielte +sposób +spre +sprechen +spricht +später +sri +ssa +sta +sta +stai +stand +stando +stanno +starai +staranno +starebbe +starebbero +starei +staremmo +staremo 
+stareste +staresti +starete +stark +start +starà +starò +stasiun +statt +stava +stavamo +stavano +stavate +stavi +stavo +stehen +steht +stellen +stellt +stellte +stemmo +stesse +stessero +stessi +stessimo +steste +stesti +stette +stettero +stetti +stia +stiamo +stiano +stiate +still +stille +sto +stor +store +su +su +sua +sua +suara +suas +suatu +sub +sub +such +sudah +sudahkah +sudahlah +sue +sue +sugl +sugl +sugli +sugli +sui +sui +suis +sukan +suku +sul +sul +sull +sull +sulla +sulla +sulle +sulle +sullo +sullo +sumber +sungai +suo +suo +suoi +suoi +suoraan +sup +sup +supaya +sur +surat +sure +sus +sus +susut +suya +suyas +suyo +suyos +swoje +syarikat +syed +syv +sz +szemben +szerint +szinte +számol +számolnak +számára +szól +szólnak +são +säga +säger +säker +säkerligen +särskilt +sätt +så +sådan +såg +sålunda +sånn +såvida inte +sí +só +sóc +són +są +t +t +t s +t'ha +t'han +t'he +t's +ta +ta +tack +tacka +tadi +tadinya +tahap +tahu +tahun +tai +tak +taka +takana +take +taken +taki +takia +takie +także +tal +tal +talán +tam +tama +tambah +tambahnya +también +també +também +tampak +tampaknya +tampil +tampoc +tan +tanah +tandas +tandasnya +tanggal +tanpa +tant +tanta +tantes +tanto +tanya +tanyakan +tanyanya +tapi +tarpeeksi +tas +tatsächlich +tawaran +taz +te +te +tegas +tegasnya +tego +tehát +teilweise +tej +teknologi +telah +televisi +teljes +tell +tem +teman +temos +tempat +tempatan +tempo +tempo +tempoh +temu +ten +tenaga +tenderar +tendremos +tendrá +tendrán +tendrás +tendré +tendréis +tendría +tendríais +tendríamos +tendrían +tendrías +tends +tened +tenemos +tenga +tengah +tengamos +tengan +tengas +tengo +tengáis +tenha +tenham +tenhamos +tenho +tenida +tenidas +tenido +tenidos +teniendo +tentang +tentar +tentara +tentaram +tente +tentei +tentu +tentulah +tentunya +tenéis +tenía +teníais +teníamos +tenían +tenías +tepat +terakhir +terasa +teraz +terbaik +terbang +terbanyak +terbesar +terbuka +terdahulu +terdapat +terdiri +terei +teremos +terhadap 
+terhadapnya +teria +teriam +teringat +terjadi +terjadilah +terjadinya +terkait +terkenal +terkira +terlalu +terlebih +terletak +terlihat +termasuk +ternyata +tersampaikan +tersebut +tersebutlah +tertentu +tertuju +terus +terutama +terá +terão +teríamos +tes +testimoni +testimony +tetap +tetapi +teu +teu +teus +teus +teva +teve +teves +też +than +thank +thanks +thanx +thanx +that +that's +thats +the +the +their +theirs +them +themselves +then +thence +there +there's +thereafter +thereby +therefore +therein +theres +theres +thereupon +these +they +they'd +they'll +they're +they've +think +third +this +thorough +thoroughly +those +though +though +three +through +throughout +thru +thru +thus +ti +ti +ti +tiada +tiap +tiba +tid +tidak +tidakkah +tidaklah +tidaknya +tidigare +tiene +tienen +tienes +tiga +til +tilbake +till +tillgängliga +tillräckligt +tills +tillsammans +tillåta +tillåter +tilstand +tim +timbalan +timur +tindakan +tinggal +tinggi +tingkat +tinha +tinham +tipo +tive +tivemos +tiver +tivera +tiveram +tiverem +tivermos +tivesse +tivessem +tivéramos +tivéssemos +tjänsteman +to +to +toata +toate +tobie +tobą +tocmai +todo +todos +tog +together +toh +toi +tokoh +ton +ton +tons +too +took +tot +tota +totes +toteż +toti +tots +totul +totusi +tovább +továbbá +toward +towards +tra +tra +trabalhar +trabalho +trat +tre +tredje +tried +tries +tro +tror +trots +trotz +truly +try +try +trying +trzeba +tu +tu +tu +tua +tua +tuas +tue +tue +tun +tun +tunai +tunjuk +tuo +tuo +tuoi +tuoi +turun +turut +tus +tutaj +tutti +tutti +tutto +tutto +tutur +tuturnya +tuturor +tuve +tuviera +tuvierais +tuvieran +tuvieras +tuvieron +tuviese +tuvieseis +tuviesen +tuvieses +tuvimos +tuviste +tuvisteis +tuviéramos +tuviésemos +tuvo +tuya +tuyas +tuyo +tuyos +tv +två +två gånger +twice +two +twoi +twoim +twoja +twoje +twym +twój +ty +tych +tylko +tym +tyvärr +tässä +tém +têm +tínhamos +több +tú +u +uang +ucap +ucapnya +ud +udara +ugyanis +uit +ujar +ujarnya +ulkopuolella +um +uma +umas 
+umum +umumnya +un +un +un +una +una +una +und +unde +under +under +under tiden +undrar +une +unei +unele +uneori +unes +unescape +unfortunately +ungkap +ungkapnya +unii +unit +universitas +unless +unlikely +uno +uno +unor +unos +uns +uns +unse +unsem +unsen +unser +unsere +unserer +unses +unter +until +unto +untuk +untung +unui +unul +up +upaya +upon +upp +uppenbarligen +uppskatta +urus +us +us +usa +usah +usaha +usai +usar +use +used +useful +user +uses +using +usually +ut +utama +utan +utanför +utara +uten +utolsó +utom +után +utána +uucp +uucp +v +va +vad +vad är +vagy +vagyis +vagyok +vahemmän +vaig +valaki +valamely +valami +valamint +valor +value +való +vam +van +vanligtvis +vannak +var +var inte +var som helst +var är +vara +varefter +varför +varhelst +vari +varifrån +varigenom +various +varit +varje +varken +varpå +vars +vas +vasen +vasenmalla +vastan +ved +veja +vele +vem +vem är +ver +verdade +verdadeiro +verdi +vergangenen +verkade +verkar +verkligen +verkligt +verschiedene +verschiedenen +versi +version +versucht +verwendet +very +vet +veu +vi +vi +vi +vi har +vi kommer +vi skulle +vi är +via +via +vid +viel +viele +vielen +vielleicht +vielä +vier +vieressä +vil +vill +ville +villig +visas +viss +vissza +viszont +vite +viz +viz +você +vocês +voi +voi +voi +volna +volt +voltak +voltam +voltunk +vom +vom +von +vor +vor +vos +vosaltres +vosotras +vosotros +vostra +vostra +vostra +vostre +vostre +vostre +vostres +vostri +vostri +vostro +vostro +votre +vous +vreo +vreun +vs +vs +vuestra +vuestras +vuestro +vuestros +vÅr +vÖre +vÖrt +vähän +väl +väldigt +välkommen +vänligen +värde +vår +vårt +völlig +w +waduh +wah +wahai +wakil +waktu +waktunya +walau +walaupun +wam +wami +wang +wanita +want +wants +war +waren +warga +warst +warta +was +was +wasn't +wasz +wasza +wasze +wat +way +we +we +we'd +we'll +we're +we've +według +weg +wegen +weil +weit +weiter +weitere +weiteren +weiterhin +weiß +wel +welche +welchem +welchen +welcher +welches +welcome +well +wenig 
+weniger +wenn +went +wer +werde +werden +were +weren't +what +what's +whatever +when +whence +whenever +where +where's +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +who's +whoever +whole +whom +whose +why +wib +wichtig +wie +wieder +wiele +wielu +wij +wilayah +will +willing +wir +wird +wirklich +wirst +wish +wissen +with +within +without +więc +więcej +wo +wobei +wohl +wollen +wollte +won't +wonder +wong +word +worden +would +wouldn't +wszyscy +wszystkich +wszystkie +wszystkim +wszystko +wtedy +wurde +wurden +wy +während +wäre +würde +würden +właśnie +x +y +y +ya +ya +yaitu +yakin +yakni +yang +yes +yet +yhdessä +ylös +yo +you +you'd +you'll +you're +you've +your +yours +yourself +yourselves +ytterligare +z +z. +z.B. +za +zahlreiche +zal +zamana +zapewne +zawsze +ze +zehn +zei +zeigen +zeigt +zero +zij +znowu +znów +zo +został +zou +zs +zu +zudem +zuletzt +zum +zumindest +zunächst +zur +zurück +zusammen +zuvor +zwar +zwei +zweite +zweiten +zwischen +zł +Å +Österreich +Über +à +às +á +által +általában +át +än +ändå +ännu +är +är inte +även +även om +å +år +åt +åt sidan +åtta +è +é +é +él +én +éppen +éramos +érem +éreu +és +étaient +étais +était +étant +étiez +étions +été +étée +étées +étés +êtes +í +így +ó +ö +öffentlichen +önskar +össze +över +överallt +övergripande +överväga +överväger +ú +úgy +új +újabb +újra +último +ü +über +überhaupt +ő +ők +őket +ű +żaden +żadna +żadne +żadnych +że +żebya +алло +без +близко +более +больше +будем +будет +будете +будешь +будто +буду +будут +будь +бы +бывает +бывь +был +была +были +было +быть +в +важная +важное +важные +важный +вам +вами +вас +ваш +ваша +ваше +ваши +вверх +вдали +вдруг +ведь +везде +весь +вниз +внизу +во +вокруг +вон +восемнадцатый +восемнадцать +восемь +восьмой +вот +впрочем +времени +время +все +всегда +всего +всем +всеми +всему +всех +всею +всю +всюду +вся +всё +второй +вы +г +где +говорил +говорит +год +года +году +да +давно +даже +далеко +дальше +даром 
+два +двадцатый +двадцать +две +двенадцатый +двенадцать +двух +девятнадцатый +девятнадцать +девятый +девять +действительно +дел +день +десятый +десять +для +до +довольно +долго +должно +другая +другие +других +друго +другое +другой +е +его +ее +ей +ему +если +есть +еще +ещё +ею +её +ж +же +жизнь +за +занят +занята +занято +заняты +затем +зато +зачем +здесь +значит +и +из +или +им +именно +иметь +ими +имя +иногда +их +к +каждая +каждое +каждые +каждый +кажется +как +какая +какой +кем +когда +кого +ком +кому +конечно +которая +которого +которой +которые +который +которых +кроме +кругом +кто +куда +лет +ли +лишь +лучше +люди +м +мало +между +меля +менее +меньше +меня +миллионов +мимо +мира +мне +много +многочисленная +многочисленное +многочисленные +многочисленный +мной +мною +мог +могут +мож +может +можно +можхо +мои +мой +мор +мочь +моя +моё +мы +на +наверху +над +надо +назад +наиболее +наконец +нам +нами +нас +начала +наш +наша +наше +наши +не +него +недавно +недалеко +нее +ней +нельзя +нем +немного +нему +непрерывно +нередко +несколько +нет +нею +неё +ни +нибудь +ниже +низко +никогда +никуда +ними +них +ничего +но +ну +нужно +нх +о +об +оба +обычно +один +одиннадцатый +одиннадцать +однажды +однако +одного +одной +около +он +она +они +оно +опять +особенно +от +отовсюду +отсюда +очень +первый +перед +по +под +пожалуйста +позже +пока +пор +пора +после +посреди +потом +потому +почему +почти +прекрасно +при +про +просто +против +процентов +пятнадцатый +пятнадцать +пятый +пять +раз +разве +рано +раньше +рядом +с +сам +сама +сами +самим +самими +самих +само +самого +самой +самом +самому +саму +свое +своего +своей +свои +своих +свою +сеаой +себе +себя +сегодня +седьмой +сейчас +семнадцатый +семнадцать +семь +сих +сказал +сказала +сказать +сколько +слишком +сначала +снова +со +собой +собою +совсем +спасибо +стал +суть +т +та +так +такая +также +такие +такое +такой +там +твой +твоя +твоё +те +тебе +тебя +тем +теми +теперь +тех +то +тобой +тобою +тогда +того +тоже +только 
+том +тому +тот +тою +третий +три +тринадцатый +тринадцать +ту +туда +тут +ты +тысяч +у +уж +уже +уметь +хорошо +хотеть +хоть +хотя +хочешь +часто +чаще +чего +человек +чем +чему +через +четвертый +четыре +четырнадцатый +четырнадцать +что +чтоб +чтобы +чуть +шестнадцатый +шестнадцать +шестой +шесть +эта +эти +этим +этими +этих +это +этого +этой +этом +этому +этот +эту +я +، +أ +ا +اثر +اجل +احد +اخرى +اذا +اربعة +اطار +اعادة +اعلنت +اف +اكثر +اكد +الا +الاخيرة +الان +الاول +الاولى +التى +التي +الثاني +الثانية +الذاتي +الذى +الذي +الذين +السابق +الف +الماضي +المقبل +الوقت +الى +اليوم +اما +امام +امس +ان +انه +انها +او +اول +اي +ايار +ايام +ايضا +ب +باسم +بان +برس +بسبب +بشكل +بعد +بعض +بن +به +بها +بين +تم +ثلاثة +ثم +جميع +حاليا +حتى +حوالى +حول +حيث +حين +خلال +دون +ذلك +زيارة +سنة +سنوات +شخصا +صباح +صفرa +ضد +ضمن +عام +عاما +عدة +عدد +عدم +عشر +عشرة +على +عليه +عليها +عن +عند +عندما +غدا +غير +ـ +ف +فان +في +فيه +فيها +قال +قبل +قد +قوة +كان +كانت +كل +كلم +كما +لا +لدى +لقاء +لكن +للامم +لم +لن +له +لها +لوكالة +ما +مايو +مساء +مع +مقابل +مليار +مليون +من +منذ +منها +نحو +نفسه +نهاية +هذا +هذه +هناك +هو +هي +و +و6 +واحد +واضاف +واضافت +واكد +وان +واوضح +وفي +وقال +وقالت +وقد +وقف +وكان +وكانت +ولا +ولم +ومن +وهو +وهي +يكون +يمكن +يوم +一 +上 +下 +不 +与 +且 +个 +为 +乃 +么 +之 +也 +了 +于 +些 +亦 +人 +今 +仍 +从 +他 +以 +们 +但 +何 +你 +使 +儿 +其 +再 +几 +凡 +凭 +则 +别 +到 +即 +却 +去 +又 +及 +另 +只 +可 +各 +同 +后 +向 +吧 +和 +咱 +哇 +哟 +哪 +啥 +啦 +嗡 +嘛 +因 +在 +她 +好 +如 +它 +小 +尔 +已 +并 +当 +往 +很 +得 +怎 +您 +我 +或 +所 +打 +把 +拿 +据 +无 +既 +是 +曾 +最 +有 +来 +某 +此 +每 +比 +沿 +用 +由 +的 +看 +着 +给 +而 +自 +至 +致 +若 +虽 +被 +让 +该 +诸 +谁 +起 +趁 +距 +跟 +还 +这 +那 +随 +靠 +가 +같이 +고 +과 +과는 +과를 +과의 +까지 +까지는 +까지의 +께 +나 +는 +다 +대로 +도 +든 +라 +라고 +로 +로는 +로부터 +로의 +를 +만 +만에 +만을 +만의 +만이 +며 +밖에 +보다 +보다는 +부터 +부터는 +아 +야 +에 +에게 +에는 +에도 +에만 +에서 +에서는 +에서도 +에서의 +엔 +여 +와 +와의 +요 +으로 +으로는 +으로부터 +으로써 +으로의 +은 +을 +의 +이 +이고 +이나 +이다 +이라고 +이라는 +이며 +처럼 +치고 +토록 +하고 +ad +а +فى diff --git a/src/main/resources/com/gravity/goose/text/stopwords-ar.txt 
b/src/main/resources/com/gravity/goose/text/stopwords-ar.txt new file mode 100644 index 000000000..64e0e7160 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-ar.txt @@ -0,0 +1,162 @@ +فى +في +كل +لم +لن +له +من +هو +هي +قوة +كما +لها +منذ +وقد +ولا +نفسه +لقاء +مقابل +هناك +وقال +وكان +نهاية +وقالت +وكانت +للامم +فيه +كلم +لكن +وفي +وقف +ولم +ومن +وهو +وهي +يوم +فيها +منها +مليار +لوكالة +يكون +يمكن +مليون +حيث +اكد +الا +اما +امس +السابق +التى +التي +اكثر +ايار +ايضا +ثلاثة +الذاتي +الاخيرة +الثاني +الثانية +الذى +الذي +الان +امام +ايام +خلال +حوالى +الذين +الاول +الاولى +بين +ذلك +دون +حول +حين +الف +الى +انه +اول +ضمن +انها +جميع +الماضي +الوقت +المقبل +اليوم +ـ +ف +و +و6 +قد +لا +ما +مع +مساء +هذا +واحد +واضاف +واضافت +فان +قبل +قال +كان +لدى +نحو +هذه +وان +واكد +كانت +واوضح +مايو +ب +ا +أ +، +عشر +عدد +عدة +عشرة +عدم +عام +عاما +عن +عند +عندما +على +عليه +عليها +زيارة +سنة +سنوات +تم +ضد +بعد +بعض +اعادة +اعلنت +بسبب +حتى +اذا +احد +اثر +برس +باسم +غدا +شخصا +صباح +اطار +اربعة +اخرى +بان +اجل +غير +بشكل +حاليا +بن +به +ثم +اف +ان +او +اي +بها +صفر \ No newline at end of file diff --git a/src/main/resources/com/gravity/goose/text/stopwords-ca.txt b/src/main/resources/com/gravity/goose/text/stopwords-ca.txt new file mode 100644 index 000000000..458ecf212 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-ca.txt @@ -0,0 +1,219 @@ +a +abans +ací +ah +així +això +al +als +aleshores +algun +alguna +algunes +alguns +alhora +allà +allí +allò +altra +altre +altres +amb +ambdós +ambdues +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquí +baix +cada +cadascú +cadascuna +cadascunes +cadascuns +com +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels +des +després +dins +dintre +donat +doncs +durant +e +eh +el +els +em +en +encara +ens +entre +érem +eren +éreu +es +és +esta +està +estàvem +estaven +estàveu +esteu +et +etc +ets +fins +fora +gairebé +ha +han +has +havia +he +hem +heu +hi +ho +i 
+igual +iguals +ja +l'hi +la +les +li +li'n +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +més +meu +meus +meva +meves +molt +molta +moltes +molts +mon +mons +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +però +perquè +poc +poca +pocs +poques +potser +propi +qual +quals +quan +quant +que +què +quelcom +qui +quin +quina +quines +quins +s'ha +s'han +sa +semblant +semblants +ses +seu +seus +seva +seva +seves +si +sobre +sobretot +sóc +solament +sols +son +són +sons +sota +sou +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +teu +teus +teva +teves +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu +vosaltres +vostra +vostre +vostres diff --git a/src/main/resources/com/gravity/goose/text/stopwords-da.txt b/src/main/resources/com/gravity/goose/text/stopwords-da.txt new file mode 100644 index 000000000..e8522ef06 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-da.txt @@ -0,0 +1,101 @@ +af +alle +andet +andre +at +begge +da +de +den +denne +der +deres +det +dette +dig +din +dog +du +ej +eller +en +end +ene +eneste +enhver +et +fem +fire +flere +fleste +for +fordi +forrige +fra +få +før +god +han +hans +har +hendes +her +hun +hvad +hvem +hver +hvilken +hvis +hvor +hvordan +hvorfor +hvornår +i +ikke +ind +ingen +intet +jeg +jeres +kan +kom +kommer +lav +lidt +lille +man +mand +mange +med +meget +men +mens +mere +mig +ned +ni +nogen +noget +ny +nyt +nær +næste +næsten +og +op +otte +over +på +se +seks +ses +som +stor +store +syv +ti +til +to +tre +ud +var diff --git a/src/main/resources/com/gravity/goose/text/stopwords-de.txt b/src/main/resources/com/gravity/goose/text/stopwords-de.txt new file mode 100644 index 000000000..67d418202 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-de.txt @@ -0,0 +1,956 @@ + +/DIE +Ab +Aber +Abgeordneten +Alle +Allerdings +Als +Alter 
+Am +Amt +An +Anfang +Angaben +Antrag +April +Arbeit +Art +Artikel +Auch +Auf +Aufgabe +Augen +August +Aus +Außerdem +Bad +Band +Bau +Bayern +Bedeutung +Beginn +Begriff +Bei +Beifall +Beim +Beispiel +Bereich +Bericht +Berliner +Bevölkerung +Bild +Bilder +Bis +Blick +Buch +Bundesregierung +BÜNDNIS +Bürger +Bürgermeister +CDU +CDU/CSU +China +DM +Da +Dabei +Damit +Dann +Das +Daten +Dazu +Den +Denn +Der +Deshalb +Deutsche +Deutschen +Deutschland +Dezember +Die +Dienstag +Dies +Diese +Dieser +Dieses +Doch +Donnerstag +Dort +Dr +Dr. +Druck +Du +Durch +Ein +Eine +Einsatz +Einwohner +Eltern +Ende +Entscheidung +Entwicklung +Er +Erfolg +Ergebnis +Erst +Es +Euro +Europa +Europäischen +FC +Fall +Familie +Februar +Fenster +Film +Firma +Folge +Form +Frage +Fragen +Frankfurt +Frankfurter +Frankreich +Franz +Frau +Frauen +Freitag +Friedrich +Für +GRÜNEN +Gebiet +Geld +Gemeinde +Gemeinden +Geschichte +Gesellschaft +Gesetz +GmbH +Gott +Grund +Gruppe +Grünen +Hamburg +Hand +Hans +Haus +Hause +Heinrich +Herr +Herren +Herrn +Heute +Hier +Hilfe +Hälfte +Höhe +ISBN +Ich +Ihnen +Ihr +Ihre +Im +In +Informationen +Interesse +Internet +Ist +Italien +Ja +Jahr +Jahre +Jahren +Jahres +Jahrhundert +Jahrhunderts +Januar +Jetzt +Johann +John +Juli +Juni +Kampf +Karl +Karriere +Kilometer +Kind +Kinder +Kirche +Klaus +Kollegen +Kommission +Kopf +Kosten +Krieg +Kritik +Kultur +Kunst +Köln +König +Lage +Land +Landes +Leben +Leute +Liebe +Liste +Literatur +London +Länder +Ländern +Mai +Mal +Man +Mann +Mannheim +Mannschaft +Mark +Markt +Martin +Maßnahmen +Meine +Meinung +Menschen +Meter +Michael +Milliarden +Millionen +Minuten +Mit +Mitarbeiter +Mitglied +Mitglieder +Mitte +Mittel +Mittwoch +Monate +Monaten +Montag +Morgen +Musik +Mutter +Männer +März +Möglichkeit +München +Nach +Nachdem +Nacht +Name +Namen +Neben +Nein +Neue +New +Nicht +Noch +Norden +November +Nr. 
+Nun +Nur +Nähe +Oktober +Opfer +Ort +Osten +PDS +Paris +Parlament +Partei +Paul +Personen +Peter +Platz +Politik +Politiker +Polizei +Preis +Problem +Probleme +Programm +Prozent +Präsident +Punkt +Quellen +Rahmen +Rat +Raum +Recht +Regel +Regie +Regierung +Region +Reihe +Richtung +Rolle +SPD +Sache +Saison +Samstag +Schon +Schule +Schweiz +Schweizer +Sein +Seine +Seit +Seite +Seiten +September +Sicherheit +Sie +Siehe +Situation +So +Sohn +Soldaten +Sommer +Sonntag +Spiel +Spiele +Spieler +Sprache +St. +Staat +Staaten +Stadt +Stelle +Straße +Stunden +Stuttgart +System +Tag +Tage +Tagen +Team +Teil +Tel. +The +Thema +Thomas +Titel +Tochter +Tod +Trainer +USA +Uhr +Um +Und +Union +Universität +Unter +Unternehmen +Unterstützung +Vater +Verein +Verfahren +Verfügung +Verlag +Viele +Von +Vor +Wahl +Was +Wasser +Weblinks +Weg +Weise +Weitere +Welt +Wenn +Wer +Werk +Werke +Westen +Wie +Wien +Wilhelm +Wir +Wirtschaft +Woche +Wochen +Wolfgang +Wort +Während +York +Zahl +Zeit +Zeitung +Ziel +Zu +Zukunft +Zum +Zur +Zusammenarbeit +Zusammenhang +ab +aber +acht +alle +allein +allem +allen +aller +allerdings +alles +als +also +alte +alten +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +arbeiten +auch +auf +aufgrund +aus +außerdem +bald +bedeutet +befindet +begann +bei +beide +beiden +beim +beispielsweise +bekannt +bekommen +bereit +bereits +besonders +besser +besteht +besten +bevor +bezeichnet +bietet +bin +bis +bisher +bist +bleiben +bleibt +blieb +bringen +bzw. +c +ca. 
+da +dabei +dadurch +dafür +dagegen +daher +damals +damit +danach +dann +daran +darauf +darf +darin +darunter +darüber +das +dass +dasselbe +davon +dazu +daß +de +dein +deine +deinem +deinen +deiner +deines +dem +demselben +den +denen +denn +denselben +der +deren +derer +derselbe +derselben +derzeit +des +deshalb +desselben +dessen +deutlich +deutsche +deutschen +deutscher +dich +die +dies +diese +dieselbe +dieselben +diesem +diesen +dieser +dieses +dir +direkt +doch +dort +dpa +drei +du +durch +dürfen +eben +ebenfalls +ebenso +ehemaligen +eher +eigene +eigenen +eigentlich +ein +eine +einem +einen +einer +eines +einfach +eingesetzt +einig +einige +einigem +einigen +einiger +einiges +einmal +einzelnen +einzige +electronic +entwickelt +er +erhalten +erhielt +erklärt +erklärte +erneut +erreichen +erreicht +erst +erste +ersten +erster +erstmals +es +etwa +etwas +euch +euer +eure +eurem +euren +eurer +eures +europäischen +fand +fast +fest +finden +findet +folgenden +for +frei +früher +führen +führt +führte +fünf +für +gab +ganz +ganze +ganzen +gar +geben +gebracht +gefunden +gegeben +gegen +gegenüber +gegründet +gehen +geht +gehören +gehört +gehörte +gekommen +gemacht +gemeinsam +genannt +genau +genommen +genug +gerade +gesagt +gesehen +gestellt +gestern +gewann +gewesen +geworden +gibt +gilt +ging +gleich +gleichen +gleichzeitig +große +großen +großer +größte +größten +gut +gute +guten +hab +habe +haben +halten +handelt +hat +hatte +hatten +heißt +her +heute +heutigen +hier +hin +hinaus +hinter +hoch +hohe +hohen +hält +hätte +hätten +häufig +ich +ihm +ihn +ihnen +ihr +ihre +ihrem +ihren +ihrer +ihres +im +immer +in +indem +innerhalb +ins +insbesondere +insgesamt +internationalen +inzwischen +ist +ja +je +jede +jedem +jeden +jeder +jedes +jedoch +jene +jenem +jenen +jener +jenes +jetzt +jeweils +kam +kamen +kann +kaum +kein +keine +keinem +keinen +keiner +keines +klar +kleine +kleinen +km +knapp +kommen +kommt +konnte +konnten +kurz +könne +können +könnte +könnten +lag 
+lang +lange +lassen +laut +leben +lediglich +leicht +letzte +letzten +liegen +liegt +ließ +lässt +läuft +machen +macht +machte +mal +man +manche +manchem +manchen +mancher +manches +mehr +mehrere +mein +meine +meinem +meinen +meiner +meines +meist +meisten +mich +mindestens +mir +mit +muss +musste +muß +möchte +möglich +müsse +müssen +nach +nachdem +nahm +natürlich +neben +nehmen +neu +neue +neuen +nicht +nichts +nie +nimmt +noch +nun +nur +nächsten +nämlich +ob +oben +obwohl +oder +of +oft +ohne +paar +per +politische +politischen +pro +recht +richtig +rund +s +sagen +sagt +sagte +sah +scheint +schließen +schließlich +schnell +schon +schwer +sechs +sehen +sehr +sei +seien +sein +seine +seinem +seinen +seiner +seines +seit +selbst +setzt +setzte +sich +sicher +sie +sieben +siehe +sieht +sind +so +sogar +solche +solchem +solchen +solcher +solches +soll +sollen +sollte +sollten +sondern +sonst +sowie +sowohl +spielen +spielt +spielte +sprechen +spricht +später +stand +stark +statt +stehen +steht +stellen +stellt +stellte +tatsächlich +taz +teilweise +the +trat +trotz +tun +um +und +uns +unse +unsem +unsen +unser +unsere +unserer +unses +unter +vergangenen +verschiedene +verschiedenen +version +versucht +verwendet +viel +viele +vielen +vielleicht +vier +vom +von +vor +völlig +war +waren +warst +was +weg +wegen +weil +weit +weiter +weitere +weiteren +weiterhin +weiß +welche +welchem +welchen +welcher +welches +wenig +weniger +wenn +wer +werde +werden +wichtig +wie +wieder +will +wir +wird +wirklich +wirst +wissen +wo +wobei +wohl +wollen +wollte +worden +wurde +wurden +während +wäre +würde +würden +z. +z.B. 
+zahlreiche +zehn +zeigen +zeigt +zu +zudem +zuletzt +zum +zumindest +zunächst +zur +zurück +zusammen +zuvor +zwar +zwei +zweite +zweiten +zwischen +Österreich +Über +öffentlichen +über +überhaupt diff --git a/src/main/resources/com/gravity/goose/text/stopwords-en.txt b/src/main/resources/com/gravity/goose/text/stopwords-en.txt index d3a39543f..a4f1a030b 100644 --- a/src/main/resources/com/gravity/goose/text/stopwords-en.txt +++ b/src/main/resources/com/gravity/goose/text/stopwords-en.txt @@ -1,546 +1,545 @@ -a's -able -about -above -according -accordingly -across -actually -after -afterwards -again -against -ain't -all -allow -allows -almost -alone -along -already -also -although -always -am -among -amongst -an -and -another -any -anybody -anyhow -anyone -anything -anyway -anyways -anywhere -apart -appear -appreciate -appropriate -are -aren't -around -as -aside -ask -asking -associated -at -available -away -awfully -be -became -because -become -becomes -becoming -been -before -beforehand -behind -being -believe -below -beside -besides -best -better -between -beyond -both -brief -but -by -c -c'mon -c's -came -campaign -can -can't -cannot -cant -cause -causes -certain -certainly -changes -clearly -co -com -come -comes -concerning -consequently -consider -considering -contain -containing -contains -corresponding -could -couldn't -course -currently -definitely -described -despite -did -didn't -different -do -does -doesn't -doing -don't -done -down -downwards -during -each -edu -eight -either -else -elsewhere -enough -endorsed -entirely -especially -et -etc -even -ever -every -everybody -everyone -everything -everywhere -ex -exactly -example -except -far -few -fifth -first -financial -five -followed -following -follows -for -former -formerly -forth -four -from -further -furthermore -get -gets -getting -given -gives -go -goes -going -gone -got -gotten -greetings -had -hadn't -happens -hardly -has -hasn't -have -haven't -having -he -he's -hello -help -hence -her -here 
-here's -hereafter -hereby -herein -hereupon -hers -herself -hi -him -himself -his -hither -hopefully -how -howbeit -however -i'd -i'll -i'm -i've -if -ignored -immediate -in -inasmuch -inc -indeed -indicate -indicated -indicates -inner -insofar -instead -into -inward -is -isn't -it -it'd -it'll -it's -its -itself -just -keep -keeps -kept -know -knows -known -last -lately -later -latter -latterly -least -less -lest -let -let's -like -liked -likely -little -look -looking -looks -ltd -mainly -many -may -maybe -me -mean -meanwhile -merely -might -more -moreover -most -mostly -much -must -my -myself -name -namely -nd -near -nearly -necessary -need -needs -neither -never -nevertheless -new -next -nine -no -nobody -non -none -noone -nor -normally -not -nothing -novel -now -nowhere -obviously -of -off -often -oh -ok -okay -old -on -once -one -ones -only -onto -or -other -others -otherwise -ought -our -ours -ourselves -out -outside -over -overall -own -particular -particularly -per -perhaps -placed -please -plus -possible -presumably -probably -provides -quite -quote -quarterly -rather -really -reasonably -regarding -regardless -regards -relatively -respectively -right -said -same -saw -say -saying -says -second -secondly -see -seeing -seem -seemed -seeming -seems -seen -self -selves -sensible -sent -serious -seriously -seven -several -shall -she -should -shouldn't -since -six -so -some -somebody -somehow -someone -something -sometime -sometimes -somewhat -somewhere -soon -sorry -specified -specify -specifying -still -sub -such -sup -sure -t's -take -taken -tell -tends -than -thank -thanks -thanx -that -that's -thats -the -their -theirs -them -themselves -then -thence -there -there's -thereafter -thereby -therefore -therein -theres -thereupon -these -they -they'd -they'll -they're -they've -think -third -this -thorough -thoroughly -those -though -three -through -throughout -thru -thus -to -together -too -took -toward -towards -tried -tries -truly -try -trying -twice -two 
-under -unfortunately -unless -unlikely -until -unto -up -upon -us -use -used -useful -uses -using -usually -uucp -value -various -very -via -viz -vs -want -wants -was -wasn't -way -we -we'd -we'll -we're -we've -welcome -well -went -were -weren't -what -what's -whatever -when -whence -whenever -where -where's -whereafter -whereas -whereby -wherein -whereupon -wherever -whether -which -while -whither -who -who's -whoever -whole -whom -whose -why -will -willing -wish -with -within -without -won't -wonder -would -would -wouldn't -yes -yet -you -you'd -you'll -you're -you've -your -yours -yourself -yourselves -zero -official -sharply -criticized \ No newline at end of file +a's +able +about +above +according +accordingly +across +actually +after +afterwards +again +against +ain't +all +allow +allows +almost +alone +along +already +also +although +always +am +among +amongst +an +and +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +apart +appear +appreciate +appropriate +are +aren't +around +as +aside +ask +asking +associated +at +available +away +awfully +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +believe +below +beside +besides +best +better +between +beyond +both +brief +but +by +c +c'mon +c's +came +campaign +can +can't +cannot +cant +cause +causes +certain +certainly +changes +clearly +co +com +come +comes +concerning +consequently +consider +considering +contain +containing +contains +corresponding +could +couldn't +course +criticized +currently +definitely +described +despite +did +didn't +different +do +does +doesn't +doing +don't +done +down +downwards +during +each +edu +eight +either +else +elsewhere +endorsed +enough +entirely +especially +et +etc +even +ever +every +everybody +everyone +everything +everywhere +ex +exactly +example +except +far +few +fifth +financial +first +five +followed +following +follows +for +former +formerly +forth +four +from +further +furthermore +get +gets 
+getting +given +gives +go +goes +going +gone +got +gotten +greetings +had +hadn't +happens +hardly +has +hasn't +have +haven't +having +he +he's +hello +help +hence +her +here +here's +hereafter +hereby +herein +hereupon +hers +herself +hi +him +himself +his +hither +hopefully +how +howbeit +however +i'd +i'll +i'm +i've +if +ignored +immediate +in +inasmuch +inc +indeed +indicate +indicated +indicates +inner +insofar +instead +into +inward +is +isn't +it +it'd +it'll +it's +its +itself +just +keep +keeps +kept +know +known +knows +last +lately +later +latter +latterly +least +less +lest +let +let's +like +liked +likely +little +look +looking +looks +ltd +mainly +many +may +maybe +me +mean +meanwhile +merely +might +more +moreover +most +mostly +much +must +my +myself +name +namely +nd +near +nearly +necessary +need +needs +neither +never +nevertheless +new +next +nine +no +nobody +non +none +noone +nor +normally +not +nothing +novel +now +nowhere +obviously +of +off +official +often +oh +ok +okay +old +on +once +one +ones +only +onto +or +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +own +particular +particularly +per +perhaps +placed +please +plus +possible +presumably +probably +provides +quarterly +quite +quote +rather +really +reasonably +regarding +regardless +regards +relatively +respectively +right +said +same +saw +say +saying +says +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +sharply +she +should +shouldn't +since +six +so +some +somebody +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specified +specify +specifying +still +sub +such +sup +sure +t's +take +taken +tell +tends +than +thank +thanks +thanx +that +that's +thats +the +their +theirs +them +themselves +then +thence +there +there's +thereafter +thereby +therefore +therein +theres +thereupon +these +they +they'd +they'll +they're 
+they've +think +third +this +thorough +thoroughly +those +though +three +through +throughout +thru +thus +to +together +too +took +toward +towards +tried +tries +truly +try +trying +twice +two +under +unfortunately +unless +unlikely +until +unto +up +upon +us +use +used +useful +uses +using +usually +uucp +value +various +very +via +viz +vs +want +wants +was +wasn't +way +we +we'd +we'll +we're +we've +welcome +well +went +were +weren't +what +what's +whatever +when +whence +whenever +where +where's +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +who's +whoever +whole +whom +whose +why +will +willing +wish +with +within +without +won't +wonder +would +wouldn't +yes +yet +you +you'd +you'll +you're +you've +your +yours +yourself +yourselves +zero diff --git a/src/main/resources/com/gravity/goose/text/stopwords-es.txt b/src/main/resources/com/gravity/goose/text/stopwords-es.txt new file mode 100644 index 000000000..3aae76b23 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-es.txt @@ -0,0 +1,310 @@ + +# forms of ser, to be (not including the infinitive): +a +al +algo +algunas +algunos +ante +antes +como +con +contra +cual +cuando +de +del +desde +donde +durante +e +el +ella +ellas +ellos +en +entre +era +erais +eran +eras +eres +es +esa +esas +ese +eso +esos +esta +estaba +estabais +estaban +estabas +estad +estada +estadas +estado +estados +estamos +estando +estar +estaremos +estará +estarán +estarás +estaré +estaréis +estaría +estaríais +estaríamos +estarían +estarías +estas +este +estemos +esto +estos +estoy +estuve +estuviera +estuvierais +estuvieran +estuvieras +estuvieron +estuviese +estuvieseis +estuviesen +estuvieses +estuvimos +estuviste +estuvisteis +estuviéramos +estuviésemos +estuvo +está +estábamos +estáis +están +estás +esté +estéis +estén +estés +fue +fuera +fuerais +fueran +fueras +fueron +fuese +fueseis +fuesen +fueses +fui +fuimos +fuiste +fuisteis +fuéramos +fuésemos +ha 
+habida +habidas +habido +habidos +habiendo +habremos +habrá +habrán +habrás +habré +habréis +habría +habríais +habríamos +habrían +habrías +habéis +había +habíais +habíamos +habían +habías +han +has +hasta +hay +haya +hayamos +hayan +hayas +hayáis +he +hemos +hube +hubiera +hubierais +hubieran +hubieras +hubieron +hubiese +hubieseis +hubiesen +hubieses +hubimos +hubiste +hubisteis +hubiéramos +hubiésemos +hubo +la +las +le +les +lo +los +me +mi +mis +mucho +muchos +muy +más +mí +mía +mías +mío +míos +nada +ni +no +nos +nosotras +nosotros +nuestra +nuestras +nuestro +nuestros +o +os +otra +otras +otro +otros +para +pero +poco +por +porque +que +quien +quienes +qué +se +sea +seamos +sean +seas +seremos +será +serán +serás +seré +seréis +sería +seríais +seríamos +serían +serías +seáis +sido +siendo +sin +sobre +sois +somos +son +soy +su +sus +suya +suyas +suyo +suyos +sí +también +tanto +te +tendremos +tendrá +tendrán +tendrás +tendré +tendréis +tendría +tendríais +tendríamos +tendrían +tendrías +tened +tenemos +tenga +tengamos +tengan +tengas +tengo +tengáis +tenida +tenidas +tenido +tenidos +teniendo +tenéis +tenía +teníais +teníamos +tenían +tenías +ti +tiene +tienen +tienes +todo +todos +tu +tus +tuve +tuviera +tuvierais +tuvieran +tuvieras +tuvieron +tuviese +tuvieseis +tuviesen +tuvieses +tuvimos +tuviste +tuvisteis +tuviéramos +tuviésemos +tuvo +tuya +tuyas +tuyo +tuyos +tú +un +una +uno +unos +vosotras +vosotros +vuestra +vuestras +vuestro +vuestros +y +ya +yo +él +éramos diff --git a/src/main/resources/com/gravity/goose/text/stopwords-fi.txt b/src/main/resources/com/gravity/goose/text/stopwords-fi.txt new file mode 100644 index 000000000..3b468b32c --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-fi.txt @@ -0,0 +1,68 @@ +alla +ansiosta +ehkä +ei +enemmän +ennen +etessa +f +haikki +he +hitaasti +hoikein +hyvin +hän +ilman +ja +jos +jälkeen +kanssa +kaukana +kenties +keskellä +kesken +koskaan +kuinkan +kukka +kylliksi +kyllä +liian +lla 
+lla +luona +lähellä +läpi +me +miksi +mikä +milloin +milloinkan +minä +missä +miten +nopeasti +nyt +oikea +oikealla +paljon +siellä +sinä +ssa +sta +suoraan +tai +takana +takia +tarpeeksi +te +tässä +ulkopuolella +vahemmän +vasen +vasenmalla +vastan +vielä +vieressä +vähän +yhdessä +ylös diff --git a/src/main/resources/com/gravity/goose/text/stopwords-fr.txt b/src/main/resources/com/gravity/goose/text/stopwords-fr.txt new file mode 100644 index 000000000..13fae674b --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-fr.txt @@ -0,0 +1,163 @@ +ai +aie +aient +aies +ait +as +au +aura +aurai +auraient +aurais +aurait +auras +aurez +auriez +aurions +aurons +auront +aux +avaient +avais +avait +avec +avez +aviez +avions +avons +ayant +ayez +ayons +c +ce +ceci +celà +ces +cet +cette +d +dans +de +des +du +elle +en +es +est +et +eu +eue +eues +eurent +eus +eusse +eussent +eusses +eussiez +eussions +eut +eux +eûmes +eût +eûtes +furent +fus +fusse +fussent +fusses +fussiez +fussions +fut +fûmes +fût +fûtes +ici +il +ils +j +je +l +la +le +les +leur +leurs +lui +m +ma +mais +me +mes +moi +mon +même +n +ne +nos +notre +nous +on +ont +ou +par +pas +pour +qu +que +quel +quelle +quelles +quels +qui +s +sa +sans +se +sera +serai +seraient +serais +serait +seras +serez +seriez +serions +serons +seront +ses +soi +soient +sois +soit +sommes +son +sont +soyez +soyons +suis +sur +t +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +y +à +étaient +étais +était +étant +étiez +étions +été +étée +étées +étés +êtes diff --git a/src/main/resources/com/gravity/goose/text/stopwords-hu.txt b/src/main/resources/com/gravity/goose/text/stopwords-hu.txt new file mode 100644 index 000000000..694feb102 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-hu.txt @@ -0,0 +1,403 @@ +a +á +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amp +amíg +amikor +át +abban +ahhoz 
+annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +b +bár +be +belül +benne +c +cikk +cikkek +cikkeket +csak +d +de +e +é +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +f +fel +felé +g +h +hanem +hiszen +hogy +hogyan +i +í +igen +így +illetve +ill. +ill +ilyen +ilyenkor +is +ison +ismét +itt +j +jó +jól +jobban +k +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +l +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +m +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +n +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +o +ó +olyan +ott +össze +ö +ő +ők +őket +p +pedig +persze +q +r +rá +s +saját +sem +semmi +sok +sokat +sokkal +sz +számára +szemben +szerint +szinte +t +talán +tehát +teljes +tovább +továbbá +több +u +ú +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +ü +ű +v +vagy +vagyis +valaki +valamely +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna +számolnak +szólnak +szól +w +x +y +z +zs +a +ahogy +ahol +aki +akkor +alatt +általában +által +amely +amíg +amikor +ami +amolyan +arra +át +az +azért +azonban +azon +aztán +azt +azután +azzal +bár +be +belül +benne +cikk +csak +de +eddig +egész +egy +egyéb +egyes +egyetlen +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +emilyen +én +éppen +erre +és +e +ez +ezen +ezért +ezzel +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +ill. 
+illetve +ill +ilyen +ilyenkor +ismét +ison +itt +jó +jobban +jól +kell +keres +keresztül +ki +kívül +között +közül +legalább +legyen +lehet +lenni +lett +maga +maga +majd +már +más +másik +még +meg +mellett +mely +mert +miért +míg +mikor +milyen +minden +mindenki +mindig +mi +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +néhány +neki +nélkül +nem +nincs +ők +olyan +ő +össze +ott +pedig +persze +rá +saját +s +sem +semmi +sokkal +sok +számára +számol +szemben +szerint +szinte +szól +talán +tehát +teljes +továbbá +tovább +úgy +ugyanis +új +újabb +újra +utána +után +utolsó +vagy +vagyis +valaki +valamely +valami +valamint +való +van +vissza +viszont +volt + diff --git a/src/main/resources/com/gravity/goose/text/stopwords-id.txt b/src/main/resources/com/gravity/goose/text/stopwords-id.txt new file mode 100644 index 000000000..418f43f9a --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-id.txt @@ -0,0 +1,1309 @@ +a +abad +acara +aceh +ada +adalah +adanya +adapun +agak +agaknya +agama +agar +agustus +air +akan +akankah +akhir +akhiri +akhirnya +akibat +aku +akulah +alam +album +amat +amatlah +amerika +anak +and +anda +andalah +anggota +antar +antara +antarabangsa +antaranya +apa +apaan +apabila +apakah +apalagi +apatah +api +april +artikel +artinya +as +asal +asalkan +asas +asia +asing +atas +atau +ataukah +ataupun +australia +awal +awalnya +awam +b +badan +bagai +bagaikan +bagaimana +bagaimanakah +bagaimanapun +bagainamakah +bagi +bagian +bahagian +bahan +baharu +bahasa +bahawa +bahkan +bahwa +bahwasannya +bahwasanya +baik +baiknya +bakal +bakalan +balik +bandar +bangsa +bank +banyak +bapak +barang +barangan +barat +baru +baru-baru +bawah +beberapa +begini +beginian +beginikah +beginilah +begitu +begitukah +begitulah +begitupun +bekas +bekerja +belakang +belakangan +belanda +beli +beliau +belum +belumlah +benar +benarkah +benarlah +bentuk +berada +berakhir +berakhirlah +berakhirnya +berapa +berapakah +berapalah +berapapun +berarti 
+berasal +berat +berawal +berbagai +berbanding +berbeda +berdasarkan +berdatangan +berharap +berhasil +beri +berikan +berikut +berikutan +berikutnya +berita +berjalan +berjaya +berjumlah +berkaitan +berkali +berkali-kali +berkata +berkehendak +berkeinginan +berkenaan +berlainan +berlaku +berlalu +berlangsung +berlebihan +bermacam +bermacam-macam +bermain +bermaksud +bermula +bernama +bernilai +bersama +bersama-sama +bersiap +bertanya +bertemu +berturut +bertutur +berubah +berujar +berupa +besar +besok +betul +betulkah +bhd +biasa +biasanya +bidang +bila +bilakah +bilion +bintang +bisa +bisakah +blog +bn +bola +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +buku +bulan +bumi +bung +bursa +cadangan +cara +caranya +catch +china +click +code +copyright +cukup +cukupkah +cukuplah +cuma +daerah +dagangan +dahulu +dalam +dan +dana +dapat +dari +daripada +dasar +data +datang +datuk +dekat +demi +demikian +demikianlah +dengan +depan +derivatives +desa +desember +detik +dewan +di +dia +diadakan +diakhiri +diakhirinya +dialah +dianggap +diantara +diantaranya +diberi +diberikan +diberikannya +dibuat +dibuatnya +dibuka +dicatatkan +didapat +didatangkan +didirikan +diduga +digunakan +diibaratkan +diibaratkannya +diingat +diingatkan +diinginkan +dijangka +dijawab +dijelaskan +dijelaskannya +dikarenakan +dikatakan +dikatakannya +dikenal +dikerjakan +diketahui +diketahuinya +dikira +dilakukan +dilalui +dilihat +dimaksud +dimaksudkan +dimaksudkannya +dimaksudnya +dimana +diminta +dimintai +dimisalkan +dimulai +dimulailah +dimulainya +dimungkinkan +dini +diniagakan +dipastikan +diperbuat +diperbuatnya +dipergunakan +diperkirakan +diperlihatkan +diperlukan +diperlukannya +dipersoalkan +dipertanyakan +dipunyai +diri +dirilis +dirinya +dis +disampaikan +disebut +disebutkan +disebutkannya +disember +disini +disinilah +distrik +ditambahkan +ditandaskan +ditanya +ditanyai +ditanyakan +ditegaskan +ditemukan +ditujukan +ditunjuk +ditunjuki +ditunjukkan +ditunjukkannya 
+ditunjuknya +ditutup +dituturkan +dituturkannya +diucapkan +diucapkannya +diungkapkan +document.write +dolar +dong +dr +dua +dulu +dunia +effective +ekonomi +eksekutif +eksport +empat +enam +enggak +enggaknya +entah +entahlah +era +eropa +err +faedah +feb +film +gat +gedung +gelar +gettracker +global +grup +guna +gunakan +gunung +hadap +hadapan +hal +hampir +hanya +hanyalah +harga +hari +harian +harus +haruslah +harusnya +hasil +hendak +hendaklah +hendaknya +hidup +hingga +https +hubungan +hukum +hutan +i +ia +iaitu +ialah +ibarat +ibaratkan +ibaratnya +ibu +ii +iklan +ikut +ilmu +indeks +india +indonesia +industri +informasi +ingat +inggris +ingin +inginkah +inginkan +ini +inikah +inilah +internasional +islam +isnin +isu +italia +itu +itukah +itulah +jabatan +jadi +jadilah +jadinya +jakarta +jalan +jalur +jaman +jan +jangan +jangankan +janganlah +januari +jauh +jawa +jawab +jawaban +jawabnya +jawatan +jawatankuasa +jelas +jelaskan +jelaslah +jelasnya +jenis +jepang +jepun +jerman +jika +jikalau +jiwa +jual +jualan +juga +julai +jumaat +jumat +jumlah +jumlahnya +jun +juni +justru +juta +kabar +kabupaten +kadar +kala +kalangan +kalau +kalaulah +kalaupun +kali +kalian +kalimantan +kami +kamilah +kamis +kamu +kamulah +kan +kantor +kapal +kapan +kapankah +kapanpun +karena +karenanya +karya +kasus +kata +katakan +katakanlah +katanya +kaunter +kawasan +ke +keadaan +kebetulan +kebutuhan +kecamatan +kecil +kedua +kedua-dua +keduanya +kedudukan +kegiatan +kehidupan +keinginan +kejadian +kekal +kelamaan +kelihatan +kelihatannya +kelima +kelompok +keluar +keluarga +kelurahan +kembali +kementerian +kemudahan +kemudian +kemungkinan +kemungkinannya +kenaikan +kenapa +kenyataan +kepada +kepadanya +kepala +kepentingan +keputusan +kerajaan +kerana +kereta +kerja +kerjasama +kes +kesampaian +keselamatan +keseluruhan +keseluruhannya +kesempatan +kesihatan +keterangan +keterlaluan +ketiga +ketika +ketua +keuntungan +kewangan +khamis +khusus +khususnya +kini +kinilah +kira +kira-kira 
+kiranya +kita +kitalah +klci +klibor +klik +km +kok +komentar +kompas +komposit +kondisi +kontrak +korban +korea +kos +kota +kuala +kuasa +kukuh +kumpulan +kurang +kurangnya +lagi +lagian +lagu +lah +lain +lainnya +laku +lalu +lama +lamanya +langkah +langsung +lanjut +lanjutnya +laporan +laut +lebih +lembaga +lepas +lewat +lima +lingkungan +login +lokasi +lot +luar +luas +lumpur +mac +macam +mahkamah +mahu +majlis +maka +makanan +makanya +makin +maklumat +malah +malahan +malam +malaysia +mampu +mampukah +mana +manakala +manalagi +mantan +manusia +masa +masalah +masalahnya +masih +masihkah +masing +masing-masing +masuk +masyarakat +mata +mau +maupun +measure +media +mei +melainkan +melakukan +melalui +melawan +melihat +melihatnya +memandangkan +memang +memastikan +membantu +membawa +memberi +memberikan +membolehkan +membuat +memerlukan +memihak +memiliki +meminta +memintakan +memisalkan +memperbuat +mempergunakan +memperkirakan +memperlihatkan +mempersiapkan +mempersoalkan +mempertanyakan +mempunyai +memulai +memungkinkan +menaiki +menambah +menambahkan +menandaskan +menanti +menantikan +menanya +menanyai +menanyakan +menarik +menawarkan +mencapai +mencari +mencatatkan +mendapat +mendapatkan +mendatang +mendatangi +mendatangkan +menegaskan +menerima +menerusi +mengadakan +mengakhiri +mengaku +mengalami +mengambil +mengapa +mengatakan +mengatakannya +mengenai +mengerjakan +mengetahui +menggalakkan +menggunakan +menghadapi +menghendaki +mengibaratkan +mengibaratkannya +mengikut +mengingat +mengingatkan +menginginkan +mengira +mengucapkan +mengucapkannya +mengumumkan +mengungkapkan +mengurangkan +meninggal +meningkat +meningkatkan +menjadi +menjalani +menjawab +menjelang +menjelaskan +menokok +menteri +menuju +menunjuk +menunjuki +menunjukkan +menunjuknya +menurut +menuturkan +menyaksikan +menyampaikan +menyangkut +menyatakan +menyebabkan +menyebutkan +menyediakan +menyeluruh +menyiapkan +merasa +mereka +merekalah +merosot +merupakan +meski +meskipun +mesyuarat 
+metrotv +meyakini +meyakinkan +milik +militer +minat +minggu +minta +minyak +mirip +misal +misalkan +misalnya +mobil +modal +mohd +mudah +mula +mulai +mulailah +mulanya +muncul +mungkin +mungkinkah +musik +musim +nah +naik +nama +namun +nanti +nantinya +nasional +negara +negara-negara +negeri +new +niaga +nilai +nomor +noun +nov +november +numeral +numeralia +nya +nyaris +nyatanya +of +ogos +okt +oktober +olah +oleh +olehnya +operasi +orang +organisasi +pada +padahal +padanya +pagetracker +pagi +pak +paling +pameran +panjang +pantas +papan +para +paras +parlimen +partai +parti +particle +pasar +pasaran +password +pasti +pastilah +pasukan +paticle +pegawai +pejabat +pekan +pekerja +pelabur +pelaburan +pelancongan +pelanggan +pelbagai +peluang +pemain +pembangunan +pemberita +pembinaan +pemerintah +pemerintahan +pemimpin +pendapatan +pendidikan +penduduk +penerbangan +pengarah +pengeluaran +pengerusi +pengguna +penggunaan +pengurusan +peniaga +peningkatan +penting +pentingnya +per +perancis +perang +peratus +percuma +perdagangan +perdana +peringkat +perjanjian +perkara +perkhidmatan +perladangan +perlu +perlukah +perlunya +permintaan +pernah +perniagaan +persekutuan +persen +persidangan +persoalan +pertama +pertandingan +pertanyaan +pertanyakan +pertubuhan +pertumbuhan +perubahan +perusahaan +pesawat +peserta +petang +pihak +pihaknya +pilihan +pinjaman +polis +polisi +politik +pos +posisi +presiden +prestasi +produk +program +projek +pronomia +pronoun +proses +proton +provinsi +pt +pubdate +pukul +pula +pulau +pun +punya +pusat +rabu +radio +raja +rakan +rakyat +ramai +rantau +rasa +rasanya +rata +raya +rendah +republik +resmi +ribu +ringgit +root +ruang +rumah +rupa +rupanya +saat +saatnya +sabah +sabtu +sahaja +saham +saja +sajalah +sakit +salah +saling +sama +sama-sama +sambil +sampai +sampaikan +sana +sangat +sangatlah +sarawak +satu +sawit +saya +sayalah +sdn +se +sebab +sebabnya +sebagai +sebagaimana +sebagainya +sebagian +sebahagian +sebaik +sebaiknya 
+sebaliknya +sebanyak +sebarang +sebegini +sebegitu +sebelah +sebelum +sebelumnya +sebenarnya +seberapa +sebesar +sebetulnya +sebisanya +sebuah +sebut +sebutlah +sebutnya +secara +secukupnya +sedang +sedangkan +sedemikian +sedikit +sedikitnya +seenaknya +segala +segalanya +segera +segi +seharusnya +sehingga +seingat +sejak +sejarah +sejauh +sejenak +sejumlah +sekadar +sekadarnya +sekali +sekali-kali +sekalian +sekaligus +sekalipun +sekarang +sekaranglah +sekecil +seketika +sekiranya +sekitar +sekitarnya +sekolah +sektor +sekurang +sekurangnya +sekuriti +sela +selagi +selain +selaku +selalu +selama +selama-lamanya +selamanya +selanjutnya +selasa +selatan +selepas +seluruh +seluruhnya +semacam +semakin +semalam +semampu +semampunya +semasa +semasih +semata +semaunya +sementara +semisal +semisalnya +sempat +semua +semuanya +semula +sen +sendiri +sendirian +sendirinya +senin +seolah +seolah-olah +seorang +sepak +sepanjang +sepantasnya +sepantasnyalah +seperlunya +seperti +sepertinya +sepihak +sept +september +serangan +serantau +seri +serikat +sering +seringnya +serta +serupa +sesaat +sesama +sesampai +sesegera +sesekali +seseorang +sesi +sesuai +sesuatu +sesuatunya +sesudah +sesudahnya +setelah +setempat +setengah +seterusnya +setiap +setiausaha +setiba +setibanya +setidak +setidaknya +setinggi +seusai +sewaktu +siap +siapa +siapakah +siapapun +siaran +sidang +singapura +sini +sinilah +sistem +soal +soalnya +sokongan +sri +stasiun +suara +suatu +sudah +sudahkah +sudahlah +sukan +suku +sumber +sungai +supaya +surat +susut +syarikat +syed +tadi +tadinya +tahap +tahu +tahun +tak +tama +tambah +tambahnya +tampak +tampaknya +tampil +tan +tanah +tandas +tandasnya +tanggal +tanpa +tanya +tanyakan +tanyanya +tapi +tawaran +tegas +tegasnya +teknologi +telah +televisi +teman +tempat +tempatan +tempo +tempoh +tenaga +tengah +tentang +tentara +tentu +tentulah +tentunya +tepat +terakhir +terasa +terbaik +terbang +terbanyak +terbesar +terbuka +terdahulu +terdapat +terdiri +terhadap 
+terhadapnya +teringat +terjadi +terjadilah +terjadinya +terkait +terkenal +terkira +terlalu +terlebih +terletak +terlihat +termasuk +ternyata +tersampaikan +tersebut +tersebutlah +tertentu +tertuju +terus +terutama +testimoni +testimony +tetap +tetapi +the +tiada +tiap +tiba +tidak +tidakkah +tidaklah +tidaknya +tiga +tim +timbalan +timur +tindakan +tinggal +tinggi +tingkat +toh +tokoh +try +tun +tunai +tunjuk +turun +turut +tutur +tuturnya +tv +uang +ucap +ucapnya +udara +ujar +ujarnya +umum +umumnya +unescape +ungkap +ungkapnya +unit +universitas +untuk +untung +upaya +urus +usah +usaha +usai +user +utama +utara +var +versi +waduh +wah +wahai +wakil +waktu +waktunya +walau +walaupun +wang +wanita +warga +warta +wib +wilayah +wong +word +ya +yaitu +yakin +yakni +yang +zaman \ No newline at end of file diff --git a/src/main/resources/com/gravity/goose/text/stopwords-it.txt b/src/main/resources/com/gravity/goose/text/stopwords-it.txt new file mode 100644 index 000000000..c14afeb9f --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-it.txt @@ -0,0 +1,412 @@ +a +a +abbia +abbiamo +abbiano +abbiate +ad +agl +agl +agli +agli +ai +ai +al +al +all +all +alla +alla +alle +alle +allo +allo +anche +anche +avemmo +avendo +avesse +avessero +avessi +avessimo +aveste +avesti +avete +aveva +avevamo +avevano +avevate +avevi +avevo +avra +avrai +avranno +avrebbe +avrebbero +avrei +avremmo +avremo +avreste +avresti +avrete +avro +avrà +avrò +avuta +avute +avuti +avuto +c +c +che +che +chi +chi +ci +ci +coi +coi +col +col +come +come +con +con +contro +contro +cui +cui +da +da +dagl +dagl +dagli +dagli +dai +dai +dal +dal +dall +dall +dalla +dalla +dalle +dalle +dallo +dallo +degl +degl +degli +degli +dei +dei +del +del +dell +dell +della +della +delle +delle +dello +dello +di +di +dov +dov +dove +dove +e +e +ebbe +ebbero +ebbi +ed +ed +era +erano +eravamo +eravate +eri +ero +essendo +faccia +facciamo +facciano +facciate +faccio +facemmo +facendo +facesse 
+facessero +facessi +facessimo +faceste +facesti +faceva +facevamo +facevano +facevate +facevi +facevo +fai +fanno +farai +faranno +farebbe +farebbero +farei +faremmo +faremo +fareste +faresti +farete +farà +farò +fece +fecero +feci +fosse +fossero +fossi +fossimo +foste +fosti +fu +fui +fummo +furono +gli +gli +ha +hai +hanno +ho +i +i +il +il +in +in +io +io +l +l +la +la +le +le +lei +lei +li +li +lo +lo +loro +loro +lui +lui +ma +ma +mi +mi +mia +mia +mie +mie +miei +miei +mio +mio +ne +ne +negl +negl +negli +negli +nei +nei +nel +nel +nell +nell +nella +nella +nelle +nelle +nello +nello +noi +noi +non +non +nostra +nostra +nostre +nostre +nostri +nostri +nostro +nostro +o +o +per +per +perche +perchè +perché +piu +più +più +quale +quale +quanta +quanta +quante +quante +quanti +quanti +quanto +quanto +quella +quella +quelle +quelle +quelli +quelli +quello +quello +questa +questa +queste +queste +questi +questi +questo +questo +sarai +saranno +sarebbe +sarebbero +sarei +saremmo +saremo +sareste +saresti +sarete +saro +sarà +sarò +se +se +sei +si +si +sia +siamo +siano +siate +siete +sono +sta +stai +stando +stanno +starai +staranno +starebbe +starebbero +starei +staremmo +staremo +stareste +staresti +starete +starà +starò +stava +stavamo +stavano +stavate +stavi +stavo +stemmo +stesse +stessero +stessi +stessimo +steste +stesti +stette +stettero +stetti +stia +stiamo +stiano +stiate +sto +su +su +sua +sua +sue +sue +sugl +sugl +sugli +sugli +sui +sui +sul +sul +sull +sull +sulla +sulla +sulle +sulle +sullo +sullo +suo +suo +suoi +suoi +ti +ti +tra +tra +tu +tu +tua +tua +tue +tue +tuo +tuo +tuoi +tuoi +tutti +tutti +tutto +tutto +un +un +una +una +uno +uno +vi +vi +voi +voi +vostra +vostra +vostre +vostre +vostri +vostri +vostro +vostro +è +é +ad diff --git a/src/main/resources/com/gravity/goose/text/stopwords-ko.txt b/src/main/resources/com/gravity/goose/text/stopwords-ko.txt new file mode 100644 index 000000000..78bd35e1c --- /dev/null +++ 
b/src/main/resources/com/gravity/goose/text/stopwords-ko.txt @@ -0,0 +1,70 @@ +가 +같이 +고 +과 +과는 +과를 +과의 +까지 +까지는 +까지의 +께 +나 +는 +다 +대로 +도 +든 +라 +라고 +로 +로는 +로부터 +로의 +를 +만 +만에 +만을 +만의 +만이 +며 +밖에 +보다 +보다는 +부터 +부터는 +아 +야 +에 +에게 +에는 +에도 +에만 +에서 +에서는 +에서도 +에서의 +엔 +여 +와 +와의 +요 +으로 +으로는 +으로부터 +으로써 +으로의 +은 +을 +의 +이 +이고 +이나 +이다 +이라고 +이라는 +이며 +처럼 +치고 +토록 +하고 diff --git a/src/main/resources/com/gravity/goose/text/stopwords-nb.txt b/src/main/resources/com/gravity/goose/text/stopwords-nb.txt new file mode 100644 index 000000000..bb9edb15e --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-nb.txt @@ -0,0 +1,117 @@ +alle +andre +arbeid +av +begge +bort +bra +bruke +da +denne +der +deres +det +din +disse +du +eller +en +ene +eneste +enhver +enn +er +et +folk +for +fordi +forsøke +fra +få +før +først +gjorde +gjøre +god +gå +ha +hadde +han +hans +hennes +her +hva +hvem +hver +hvilken +hvis +hvor +hvordan +hvorfor +ikke +inn +innen +kan +kunne +lage +lang +lik +like +makt +mange +med +meg +meget +men +mens +mer +mest +min +mye +må +måte +navn +nei +ny +nå +når +og +også +om +opp +oss +over +part +punkt +på +rett +riktig +samme +sant +si +siden +sist +skulle +slik +slutt +som +start +stille +tid +til +tilbake +tilstand +under +ut +uten +var +ved +verdi +vi +vil +ville +vite +vår +være +vært +å diff --git a/src/main/resources/com/gravity/goose/text/stopwords-nl.txt b/src/main/resources/com/gravity/goose/text/stopwords-nl.txt new file mode 100644 index 000000000..300c36833 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-nl.txt @@ -0,0 +1,48 @@ +aan +af +al +als +bij +dan +dat +die +dit +een +en +er +had +heb +hem +het +hij +hoe +hun +ik +in +is +je +kan +me +men +met +mij +nog +nu +of +ons +ook +te +tot +uit +van +was +wat +we +wel +wij +zal +ze +zei +zij +zo +zou diff --git a/src/main/resources/com/gravity/goose/text/stopwords-no.txt b/src/main/resources/com/gravity/goose/text/stopwords-no.txt new file mode 100644 index 000000000..4b14918b2
--- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-no.txt @@ -0,0 +1,120 @@ +at +av +de +den +der +det +du +en +er +et +for +fra +før +med +og +om +over +på +som +til +ved +år +alle +bare +ble +bort +bra +da +deg +dem +denne +dere +deres +det +dette +din +disse +dit +ditt +eller +ene +enn +er +et +ett +etter +for +fram +først +få +god +gå +ha +han +hans +har +her +hit +hun +hva +hvem +hver +ikke +inn +ja +jeg +kan +kom +kun +kunne +lage +lang +lik +like +man +mer +min +mot +mye +må +måte +ned +nei +noe +noen +ny +nå +når +også +opp +oss +seg +selv +si +siden +sin +sine +sist +skal +skulle +slik +som +så +sånn +tid +til +under +ut +uten +var +ved +vi +vil +vite +vår +å +dei +di +då +eg \ No newline at end of file diff --git a/src/main/resources/com/gravity/goose/text/stopwords-pl.txt b/src/main/resources/com/gravity/goose/text/stopwords-pl.txt new file mode 100644 index 000000000..3451a04d9 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-pl.txt @@ -0,0 +1,353 @@ +a +aby +ach +acz +aczkolwiek +aj +albo +ale +alez +ależ +ani +az +aż +bardziej +bardzo +beda +bedzie +bez +bo +bowiem +by +byc +byl +byla +byli +bylo +byly +bynajmniej +być +był +była +było +były +będzie +będą +cala +cali +caly +cała +cały +ci +cie +ciebie +cię +co +cokolwiek +cos +coś +czasami +czasem +czemu +czy +czyli +daleko +dla +dlaczego +dlatego +do +dobrze +dokad +dokąd +dosc +dość +duzo +dużo +dwa +dwaj +dwie +dwoje +dzis +dzisiaj +dziś +gdy +gdyby +gdyz +gdyż +gdzie +gdziekolwiek +gdzies +gdzieś +go +i +ich +ile +im +inna +inne +inny +innych +iz +iż +ja +jak +jakas +jakaś +jakby +jaki +jakichs +jakichś +jakie +jakis +jakiz +jakiś +jakiż +jakkolwiek +jako +jakos +jakoś +je +jeden +jedna +jednak +jednakze +jednakże +jedno +jego +jej +jemu +jesli +jest +jestem +jeszcze +jezeli +jeśli +jeżeli +juz +już +ją +kazdy +każdy +kiedy +kilka +kims +kimś +kto +ktokolwiek +ktora +ktore +ktorego +ktorej +ktory +ktorych +ktorym +ktorzy +ktos +ktoś +która +które 
+którego +której +który +których +którym +którzy +ku +lat +lecz +lub +ma +maja +mają +mam +mało +mi +miedzy +mimo +między +mna +mnie +mną +moga +mogą +moi +moim +moj +moja +moje +moze +mozliwe +mozna +może +możliwe +można +mu +musi +my +mój +na +nad +nam +nami +nas +nasi +nasz +nasza +nasze +naszego +naszych +natomiast +natychmiast +nawet +nia +nic +nich +nie +niech +niego +niej +niemu +nigdy +nim +nimi +niz +nią +niż +no +o +obok +od +okolo +około +on +ona +one +oni +ono +oraz +oto +owszem +pan +pana +pani +po +pod +podczas +pomimo +ponad +poniewaz +ponieważ +powinien +powinna +powinni +powinno +poza +prawie +przeciez +przecież +przed +przede +przedtem +przez +przy +roku +rowniez +również +sa +sam +sama +sie +się +skad +skąd +soba +sobie +sobą +sposob +sposób +swoje +są +ta +tak +taka +taki +takie +takze +także +tam +te +tego +tej +temu +ten +teraz +tez +też +to +toba +tobie +tobą +totez +toteż +trzeba +tu +tutaj +twoi +twoim +twoj +twoja +twoje +twym +twój +ty +tych +tylko +tym +u +w +wam +wami +was +wasz +wasza +wasze +we +wedlug +według +wiec +wiecej +wiele +wielu +więc +więcej +wlasnie +wszyscy +wszystkich +wszystkie +wszystkim +wszystko +wtedy +wy +właśnie +z +za +zaden +zadna +zadne +zadnych +zapewne +zawsze +ze +zeby +znow +znowu +znów +zostal +został +zł +żaden +żadna +żadne +żadnych +że +żebya diff --git a/src/main/resources/com/gravity/goose/text/stopwords-pt.txt b/src/main/resources/com/gravity/goose/text/stopwords-pt.txt new file mode 100644 index 000000000..4e0189552 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-pt.txt @@ -0,0 +1,295 @@ +a +acerca +agora +algmas +alguns +ali +ambos +antes +ao +aos +apontar +aquela +aquelas +aquele +aqueles +aqui +aquilo +as +atrás +até +bem +bom +cada +caminho +cima +com +como +comprido +conhecido +corrente +da +das +de +debaixo +dela +delas +dele +deles +dentro +depois +desde +desligado +deve +devem +deverá +direita +diz +dizer +do +dois +dos +e +ela +elas +ele +eles +em +enquanto +entre 
+então +era +eram +essa +essas +esse +esses +esta +estado +estamos +estar +estará +estas +estava +estavam +este +esteja +estejam +estejamos +estes +esteve +estive +estivemos +estiver +estivera +estiveram +estiverem +estivermos +estivesse +estivessem +estivéramos +estivéssemos +estou +está +estávamos +estão +eu +fará +faz +fazer +fazia +fez +fim +foi +fomos +for +fora +foram +forem +formos +fosse +fossem +fui +fôramos +fôssemos +haja +hajam +hajamos +havemos +hei +horas +houve +houvemos +houver +houvera +houveram +houverei +houverem +houveremos +houveria +houveriam +houvermos +houverá +houverão +houveríamos +houvesse +houvessem +houvéramos +houvéssemos +há +hão +iniciar +inicio +ir +irá +isso +ista +iste +isto +já +lhe +lhes +ligado +maioria +maiorias +mais +mas +me +mesmo +meu +meus +minha +minhas +muito +muitos +na +nas +nem +no +nome +nos +nossa +nossas +nosso +nossos +novo +num +numa +não +nós +o +onde +os +ou +outro +para +parte +pegar +pela +pelas +pelo +pelos +pessoas +pode +poderá +podia +por +porque +povo +promeiro +qual +qualquer +quando +que +quem +quieto +quê +saber +se +seja +sejam +sejamos +sem +ser +serei +seremos +seria +seriam +será +serão +seríamos +seu +seus +somente +somos +sou +sua +suas +são +só +tal +também +te +tem +temos +tempo +tenha +tenham +tenhamos +tenho +tentar +tentaram +tente +tentei +terei +teremos +teria +teriam +terá +terão +teríamos +teu +teus +teve +tinha +tinham +tipo +tive +tivemos +tiver +tivera +tiveram +tiverem +tivermos +tivesse +tivessem +tivéramos +tivéssemos +todos +trabalhar +trabalho +tu +tua +tuas +tém +têm +tínhamos +um +uma +umas +uns +usa +usar +valor +veja +ver +verdade +verdadeiro +você +vocês +vos +à +às +é +éramos +último diff --git a/src/main/resources/com/gravity/goose/text/stopwords-ro.txt b/src/main/resources/com/gravity/goose/text/stopwords-ro.txt new file mode 100644 index 000000000..52001f744 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-ro.txt @@ -0,0 +1,246 @@ +a +abia +acea 
+aceasta +aceea +aceeasi +aceia +acel +acela +acelasi +acelea +acest +acesta +aceste +acestea +acestei +acestia +acestui +acolo +acum +adica +ai +aia +aici +aiurea +al +ala +alaturi +ale +alt +alta +altceva +alte +altfel +alti +altii +altul +am +anume +apoi +ar +are +as +asa +asemenea +asta +astazi +astfel +asupra +atare +ati +atit +atita +atitea +atitia +atunci +au +avea +avem +avut +azi +b +ba +bine +c +ca +cam +capat +care +careia +carora +caruia +catre +ce +cea +ceea +cei +ceilalti +cel +cele +celor +ceva +chiar +ci +cind +cine +cineva +cit +cita +cite +citeva +citi +citiva +conform +cu +cui +cum +cumva +d +da +daca +dar +dat +de +deasupra +deci +decit +degraba +deja +desi +despre +din +dintr +dintre +doar +dupa +e +ea +ei +el +ele +era +este +eu +exact +f +face +fara +fata +fel +fi +fie +foarte +fost +g +geaba +h +i +ia +iar +ii +il +imi +in +inainte +inapoi +inca +incit +insa +intr +intre +isi +iti +j +k +l +la +le +li +lor +lui +m +ma +mai +mare +mi +mod +mult +multa +multe +multi +n +ne +ni +nici +niciodata +nimeni +nimic +niste +noi +nostri +nou +noua +nu +numai +o +or +ori +orice +oricum +p +pai +parca +pe +pentru +peste +pina +plus +prea +prin +putini +r +s +sa +sai +sale +sau +se +si +sint +sintem +spre +sub +sus +t +te +ti +toata +toate +tocmai +tot +toti +totul +totusi +tu +tuturor +u +un +una +unde +unei +unele +uneori +unii +unor +unui +unul +v +va +voi +vom +vor +vreo +vreun +x +z diff --git a/src/main/resources/com/gravity/goose/text/stopwords-ru.txt b/src/main/resources/com/gravity/goose/text/stopwords-ru.txt new file mode 100644 index 000000000..94984803c --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-ru.txt @@ -0,0 +1,421 @@ +а +е +и +ж +м +о +на +не +ни +об +но +он +мне +мои +мож +она +они +оно +мной +много +многочисленное +многочисленная +многочисленные +многочисленный +мною +мой +мог +могут +можно +может +можхо +мор +моя +моё +мочь +над +нее +оба +нам +нем +нами +ними +мимо +немного +одной +одного +менее +однажды 
+однако +меня +нему +меньше +ней +наверху +него +ниже +мало +надо +один +одиннадцать +одиннадцатый +назад +наиболее +недавно +миллионов +недалеко +между +низко +меля +нельзя +нибудь +непрерывно +наконец +никогда +никуда +нас +наш +нет +нею +неё +них +мира +наша +наше +наши +ничего +начала +нередко +несколько +обычно +опять +около +мы +ну +нх +от +отовсюду +особенно +нужно +очень +отсюда +в +во +вон +вниз +внизу +вокруг +вот +восемнадцать +восемнадцатый +восемь +восьмой +вверх +вам +вами +важное +важная +важные +важный +вдали +везде +ведь +вас +ваш +ваша +ваше +ваши +впрочем +весь +вдруг +вы +все +второй +всем +всеми +времени +время +всему +всего +всегда +всех +всею +всю +вся +всё +всюду +г +год +говорил +говорит +года +году +где +да +ее +за +из +ли +же +им +до +по +ими +под +иногда +довольно +именно +долго +позже +более +должно +пожалуйста +значит +иметь +больше +пока +ему +имя +пор +пора +потом +потому +после +почему +почти +посреди +ей +два +две +двенадцать +двенадцатый +двадцать +двадцатый +двух +его +дел +или +без +день +занят +занята +занято +заняты +действительно +давно +девятнадцать +девятнадцатый +девять +девятый +даже +алло +жизнь +далеко +близко +здесь +дальше +для +лет +зато +даром +первый +перед +затем +зачем +лишь +десять +десятый +ею +её +их +бы +еще +при +был +про +процентов +против +просто +бывает +бывь +если +люди +была +были +было +будем +будет +будете +будешь +прекрасно +буду +будь +будто +будут +ещё +пятнадцать +пятнадцатый +друго +другое +другой +другие +другая +других +есть +пять +быть +лучше +пятый +к +ком +конечно +кому +кого +когда +которой +которого +которая +которые +который +которых +кем +каждое +каждая +каждые +каждый +кажется +как +какой +какая +кто +кроме +куда +кругом +с +т +у +я +та +те +уж +со +то +том +снова +тому +совсем +того +тогда +тоже +собой +тобой +собою +тобою +сначала +только +уметь +тот +тою +хорошо +хотеть +хочешь +хоть +хотя +свое +свои +твой +своей +своего +своих +свою +твоя +твоё +раз +уже +сам +там +тем +чем +сама 
+сами +теми +само +рано +самом +самому +самой +самого +семнадцать +семнадцатый +самим +самими +самих +саму +семь +чему +раньше +сейчас +чего +сегодня +себе +тебе +сеаой +человек +разве +теперь +себя +тебя +седьмой +спасибо +слишком +так +такое +такой +такие +также +такая +сих +тех +чаще +четвертый +через +часто +шестой +шестнадцать +шестнадцатый +шесть +четыре +четырнадцать +четырнадцатый +сколько +сказал +сказала +сказать +ту +ты +три +эта +эти +что +это +чтоб +этом +этому +этой +этого +чтобы +этот +стал +туда +этим +этими +рядом +тринадцать +тринадцатый +этих +третий +тут +эту +суть +чуть +тысяч diff --git a/src/main/resources/com/gravity/goose/text/stopwords-sv.txt b/src/main/resources/com/gravity/goose/text/stopwords-sv.txt new file mode 100644 index 000000000..74c0a895f --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-sv.txt @@ -0,0 +1,547 @@ +#----------------------------------------------------------------------- +# translated +#----------------------------------------------------------------------- + +kunna +om +ovan +enligt +i enlighet med detta +över +faktiskt +efter +efteråt +igen +mot +är inte +alla +tillåta +tillåter +nästan +ensam +längs +redan +också +även om +alltid +am +bland +bland +en +och +en annan +någon +någon +hur som helst +någon +något +ändå +ändå +var som helst +isär +visas +uppskatta +lämpligt +är +inte +runt +som +åt sidan +be +frågar +associerad +vid +tillgängliga +bort +väldigt +vara +blev +eftersom +bli +blir +blir +varit +innan +förhand +bakom +vara +tro +nedan +bredvid +förutom +bäst +bättre +mellan +bortom +både +kort +men +genom +c +c'mon +c: s +kom +kampanj +kan +kan inte +kan inte +cant +orsaka +orsaker +viss +säkerligen +förändringar +klart +co +com +komma +kommer +om +följaktligen +överväga +överväger +innehålla +innehållande +innehåller +motsvarande +kunde +kunde inte +kurs +närvarande +definitivt +beskrivits +trots +gjorde +inte +olika +göra +gör +inte +gör +inte +gjort +ned +nedåt +under +varje +edu 
+åtta +antingen +annars +någon annanstans +tillräckligt +godkändes +helt +speciellt +et +etc +även +någonsin +varje +alla +alla +allt +överallt +ex +exakt +exempel +utom +långt +få +femte +först +finansiella +fem +följt +efter +följer +för +fd +tidigare +framåt +fyra +från +ytterligare +dessutom +få +blir +få +given +ger +gå +går +gå +borta +fick +fått +hälsningar +hade +hade inte +händer +knappast +har +har inte +ha +har inte +med +han +han är +hallå +hjälpa +hence +henne +här +här finns +härefter +härmed +häri +härpå +hennes +själv +hej +honom +själv +hans +hit +förhoppningsvis +hur +howbeit +dock +jag skulle +jag ska +jag är +jag har +om +ignoreras +omedelbar +i +eftersom +inc +indeed +indikera +indikerade +indikerar +inre +mån +istället +in +inåt +är +är inte +den +det skulle +det ska +det är +dess +själv +bara +hålla +håller +hålls +vet +vet +känd +sista +nyligen +senare +senare +latterly +minst +mindre +lest +låt +låt oss +liknande +gillade +sannolikt +lite +ser +ser +ser +ltd +huvudsakligen +många +kan +kanske +mig +betyda +under tiden +endast +kanske +mer +dessutom +mest +mestadels +mycket +måste +min +själv +namn +nämligen +nd +nära +nästan +nödvändigt +behöver +behov +varken +aldrig +ändå +ny +nästa +nio +ingen +ingen +icke +ingen +ingen +eller +normalt +inte +ingenting +roman +nu +ingenstans +uppenbarligen +av +off +ofta +oh +ok +okay +gammal +på +en gång +ett +ettor +endast +på +eller +andra +andra +annars +borde +vår +vårt +oss +ut +utanför +över +övergripande +egen +särskilt +särskilt +per +kanske +placeras +vänligen +plus +möjligt +förmodligen +förmodligen +ger +ganska +citera +kvartalsvis +snarare +verkligen +rimligen +om +oavsett +gäller +relativt +respektive +höger +sa +samma +såg +säga +säger +säger +andra +det andra +se +ser +verkar +verkade +informationsproblem +verkar +sett +själv +själva +förnuftig +skickas +allvarlig +allvarligt +sju +flera +skall +hon +bör +bör inte +eftersom +sex +så +några +någon +på något sätt +någon +något +sometime 
+ibland +något +någonstans +snart +sorry +specificerade +ange +ange +fortfarande +sub +sådan +sup +säker +t s +ta +tas +berätta +tenderar +än +tacka +tack +thanx +att +det är +brinner +den +deras +deras +dem +själva +sedan +därifrån +där +det finns +därefter +därigenom +därför +däri +theres +därpå +dessa +de +de hade +de kommer +de är +de har +tror +tredje +detta +grundlig +grundligt +de +though +tre +genom +hela +thru +sålunda +till +tillsammans +alltför +tog +mot +mot +försökte +försöker +verkligt +försök +försöker +två gånger +två +enligt +tyvärr +såvida inte +osannolikt +tills +åt +upp +på +oss +använda +används +användbar +använder +användning +vanligtvis +uucp +värde +olika +mycket +via +viz +vs +vill +vill +var +var inte +sätt +vi +vi skulle +vi kommer +vi är +vi har +välkommen +väl +gick +var +var inte +vad +vad är +oavsett +när +varifrån +närhelst +där +var är +varefter +medan +varigenom +vari +varpå +varhelst +huruvida +som +medan +dit +som +vem är +vem +hela +vem +vars +varför +kommer +villig +önskar +med +inom +utan +kommer inte +undrar +skulle +skulle inte +ja +ännu +ni +du skulle +kommer du +du är +du har +din +själv +er +noll +tjänsteman +skarpt +kritiserade diff --git a/src/main/resources/com/gravity/goose/text/stopwords-zh.txt b/src/main/resources/com/gravity/goose/text/stopwords-zh.txt new file mode 100644 index 000000000..955ff2b05 --- /dev/null +++ b/src/main/resources/com/gravity/goose/text/stopwords-zh.txt @@ -0,0 +1,125 @@ +的 +一 +不 +在 +人 +有 +是 +为 +以 +于 +上 +他 +而 +后 +之 +来 +及 +了 +因 +下 +可 +到 +由 +这 +与 +也 +此 +但 +并 +个 +其 +已 +无 +小 +我 +们 +起 +最 +再 +今 +去 +好 +只 +又 +或 +很 +亦 +某 +把 +那 +你 +乃 +它 +吧 +被 +比 +别 +趁 +当 +从 +到 +得 +打 +凡 +儿 +尔 +该 +各 +给 +跟 +和 +何 +还 +即 +几 +既 +看 +据 +距 +靠 +啦 +了 +另 +么 +每 +们 +嘛 +拿 +哪 +那 +您 +凭 +且 +却 +让 +仍 +啥 +如 +若 +使 +谁 +虽 +随 +同 +所 +她 +哇 +嗡 +往 +哪 +些 +向 +沿 +哟 +用 +于 +咱 +则 +怎 +曾 +至 +致 +着 +诸 +自 \ No newline at end of file diff --git a/src/main/scala/com/gravity/goose/Article.scala b/src/main/scala/com/gravity/goose/Article.scala index 
40b4dadcd..13bc326a2 100644 --- a/src/main/scala/com/gravity/goose/Article.scala +++ b/src/main/scala/com/gravity/goose/Article.scala @@ -19,100 +19,134 @@ package com.gravity.goose import images.Image -import org.jsoup.nodes.{Element, Document} +import org.jsoup.nodes.{ Element, Document } import java.util.Date import scala.collection._ +import beans.BeanProperty +import com.gravity.goose.opengraph.OpenGraphData /** -* Created by Jim Plush -* User: jim -* Date: 8/14/11 -*/ + * Created by Jim Plush + * User: jim + * Date: 8/14/11 + */ class Article { /** - * title of the article - */ + * title of the article + */ + @BeanProperty var title: String = null /** - * stores the lovely, pure text from the article, stripped of html, formatting, etc... - * just raw text with paragraphs separated by newlines. This is probably what you want to use. - */ + * stores the lovely, pure text from the article, stripped of html, formatting, etc... + * just raw text with paragraphs separated by newlines. This is probably what you want to use. + */ + @BeanProperty var cleanedArticleText: String = "" /** - * meta description field in HTML source - */ + * article with the originals HTML tags (

, , ..) + */ + var htmlArticle: String = "" + + /** + * meta description field in HTML source + */ + @BeanProperty var metaDescription: String = "" /** - * meta keywords field in the HTML source - */ + * meta keywords field in the HTML source + */ + @BeanProperty var metaKeywords: String = "" /** - * The canonical link of this article if found in the meta data - */ + * The canonical link of this article if found in the meta data + */ + @BeanProperty var canonicalLink: String = "" /** - * holds the domain of this article we're parsing - */ + * holds the domain of this article we're parsing + */ + @BeanProperty var domain: String = "" /** - * holds the top Element we think is a candidate for the main body of the article - */ + * holds the top Element we think is a candidate for the main body of the article + */ + @BeanProperty var topNode: Element = null /** - * holds the top Image object that we think represents this article - */ + * holds the top Image object that we think represents this article + */ + @BeanProperty var topImage: Image = new Image + /** + * all article images in the order they were found + */ + @BeanProperty + var allImages: List[Image] = Nil /** - * holds a set of tags that may have been in the artcle, these are not meta keywords - */ + * holds a set of tags that may have been in the artcle, these are not meta keywords + */ + @BeanProperty var tags: Set[String] = null /** - * holds a list of any movies we found on the page like youtube, vimeo - */ + * holds a list of links in the article + */ + @BeanProperty + var links: List[Map[String, String]] = Nil + + /** + * holds a list of any movies we found on the page like youtube, vimeo + */ + @BeanProperty var movies: List[Element] = Nil /** - * stores the final URL that we're going to try and fetch content against, this would be expanded if any - * escaped fragments were found in the starting url - */ - var finalUrl: String = ""; + * stores the final URL that we're going to try and fetch content 
against, this would be expanded if any + * escaped fragments were found in the starting url + */ + @BeanProperty + var finalUrl: String = "" /** - * stores the MD5 hash of the url to use for various identification tasks - */ - var linkhash: String = ""; + * stores the MD5 hash of the url to use for various identification tasks + */ + @BeanProperty + var linkhash: String = "" /** - * stores the RAW HTML straight from the network connection - */ + * stores the RAW HTML straight from the network connection + */ + @BeanProperty var rawHtml: String = "" /** - * the JSoup Document object - */ + * the JSoup Document object + */ + @BeanProperty var doc: Document = null /** - * this is the original JSoup document that contains a pure object from the original HTML without any cleaning - * options done on it - */ + * this is the original JSoup document that contains a pure object from the original HTML without any cleaning + * options done on it + */ + @BeanProperty var rawDoc: Document = null /** - * Sometimes useful to try and know when the publish date of an article was - */ + * Sometimes useful to try and know when the publish date of an article was + */ + @BeanProperty var publishDate: Date = null /** @@ -121,5 +155,38 @@ class Article { * which is executed before document cleansing within {@link com.gravity.goose.CrawlingActor#crawl} * @return a {@link Map Map<String,String>} of property name to property vaue (represented as a {@link String}. 
*/ + @BeanProperty var additionalData: Map[String, String] = Map.empty -} \ No newline at end of file + + /** + * Facebook Open Graph data that is found in Article Meta tags + */ + var openGraphData: OpenGraphData = null + + override def toString = + fields.filterNot(_._1=="rawHtml").filterNot(_._1=="doc").filterNot(_._1=="rawDoc").mkString("\n") +// s"""Article{ +// title=$title, +// finalUrl=$finalUrl, +// cleanedArticleText=$cleanedArticleText, +// topImage=$topImage, +// tags=$tags, +// openGraphData=${openGraphData.values.mkString("\n ")}, +// metaDescription=$metaDescription, +// metaKeywords=[$metaKeywords], +// canonicalLink=$canonicalLink, +// allImages=$allImages, +// additionalData=$additionalData +//}""" + def fields = { + import reflect.runtime.universe._ + import reflect.runtime.currentMirror + + val r = currentMirror.reflect(this) + r.symbol.typeSignature.members.toStream + .collect { case s: TermSymbol if !s.isMethod => r.reflectField(s) } + .map{r => + r.symbol.name.toString.trim -> (if(r.get==null) "" else r.get.toString) + }.toMap + } +} diff --git a/src/main/scala/com/gravity/goose/Configuration.scala b/src/main/scala/com/gravity/goose/Configuration.scala index 20ce4653a..2cba5b825 100644 --- a/src/main/scala/com/gravity/goose/Configuration.scala +++ b/src/main/scala/com/gravity/goose/Configuration.scala @@ -18,12 +18,25 @@ package com.gravity.goose -import network.{HtmlFetcher, AbstractHtmlFetcher} +import scala.collection.JavaConversions._ +import network.{ HtmlFetcher, AbstractHtmlFetcher } import org.jsoup.nodes.Element -import java.util.Date -import reflect.BeanProperty -import com.gravity.goose.extractors.{StandardContentExtractor, ContentExtractor, AdditionalDataExtractor, PublishDateExtractor} - +import com.github.nscala_time.time.Imports._ +import scala.beans.BeanProperty +import com.gravity.goose.extractors._ +import java.net.URL +import org.apache.http.util.EntityUtils +import org.apache.http.HttpEntity + +object Language { +
object English extends Language("en") object Chinese extends Language("zh") object Korean extends Language("kr") object Arabic extends Language("ar") +} +case class Language(lang: String) + +import Language._ /** * Created by Jim Plush @@ -31,53 +44,87 @@ import com.gravity.goose.extractors.{StandardContentExtractor, ContentExtractor, * Date: 8/16/11 */ - -class Configuration { +case class Configuration( + /** + * Language setting for this configuration; defaults to English + */ + @BeanProperty var language: Language = Language.English, /** - * this is the local storage path used to place images to inspect them, should be writable - */ - @BeanProperty - var localStoragePath: String = "/tmp/goose" + * this is the local storage path used to place images to inspect them, should be writable + */ + @BeanProperty var localStoragePath: String = "/tmp/goose", /** - * What's the minimum bytes for an image we'd accept is, alot of times we want to filter out the author's little images - * in the beginning of the article - */ - @BeanProperty - var minBytesForImages: Int = 4500 + * The minimum size in bytes for an image we'd accept; a lot of times we want to filter out the author's little images + * in the beginning of the article + */ + @BeanProperty var minBytesForImages: Int = 4500, /** - * set this guy to false if you don't care about getting images, otherwise you can either use the default - * image extractor to implement the ImageExtractor interface to build your own - */ - @BeanProperty - var enableImageFetching: Boolean = true + * Minimum legal width for an image - smaller than this is considered unusable/undesirable + */ + @BeanProperty var minWidth: Int = 120, /** - * path to your imagemagick convert executable, on the mac using mac ports this is the default listed - */ - @BeanProperty - var imagemagickConvertPath: String = "/opt/local/bin/convert" + * Minimum legal height for an image - smaller than this is considered unusable/undesirable + */ +
@BeanProperty var minHeight: Int = 120, /** - * path to your imagemagick identify executable - */ - @BeanProperty - var imagemagickIdentifyPath: String = "/opt/local/bin/identify" - - @BeanProperty - var connectionTimeout: Int = 10000 + * set this guy to false if you don't care about getting images, otherwise you can either use the default + * image extractor to implement the ImageExtractor interface to build your own + */ + @BeanProperty var enableImageFetching: Boolean = true, + /** + * set this guy to false if you don't care about getting All images, otherwise you can either use the default + * image extractor to implement the ImageExtractor interface to build your own + */ + @BeanProperty var enableAllImagesFetching: Boolean = true, + + /** + * path to your imagemagick convert executable, on the mac using mac ports this is the default listed + */ + @BeanProperty + //var imagemagickConvertPath: String = "/usr/local/bin/convert", + //var imagemagickConvertPath: String = sys.env.get("GOOSE_IMGMAGICK_CONVERT_PATH").getOrElse("/opt/local/bin/convert"), + var imagemagickConvertPath: String = sys.env.get("GOOSE_IMGMAGICK_CONVERT_PATH").getOrElse("convert"), + + /** + * path to your imagemagick identify executable + */ + @BeanProperty + //var imagemagickIdentifyPath: String = "/usr/local/bin/identify", + //var imagemagickIdentifyPath: String = "identify", + //var imagemagickIdentifyPath: String = sys.env.get("GOOSE_IMGMAGICK_IDENTIFY_PATH").getOrElse("/opt/local/bin/identify"), + var imagemagickIdentifyPath: String = sys.env.get("GOOSE_IMGMAGICK_IDENTIFY_PATH").getOrElse("identify"), + + @BeanProperty var connectionTimeout: Int = 10000 // 10 seconds + , + + @BeanProperty var socketTimeout: Int = 10000 // 10 seconds + , + + @BeanProperty var imageConnectionTimeout: Int = 2000 // 2 seconds + , + + @BeanProperty var imageSocketTimeout: Int = 5000 // 5 seconds + , - @BeanProperty - var socketTimeout: Int = 10000 + /** + * used as the user agent that is sent with your web 
requests to extract an article + */ + @BeanProperty var browserUserAgent: String = "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8", /** - * used as the user agent that is sent with your web requests to extract an article - */ - @BeanProperty - var browserUserAgent: String = "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8" + * sent as the referer header + */ + @BeanProperty var browserReferer: String = "https://www.google.com") { var contentExtractor: ContentExtractor = StandardContentExtractor var publishDateExtractor: PublishDateExtractor = new PublishDateExtractor { - def extract(rootElement: Element): Date = { + def extract(rootElement: Element): DateTime = { + // Try to retrieve publish time from open graph data + val dateParser = org.joda.time.format.ISODateTimeFormat.dateTimeParser + for (el <- rootElement.select("meta[property=article:published_time]")) + return dateParser.parseDateTime(el.attr("content")) null } } @@ -93,10 +140,10 @@ class Configuration { } /** - * Pass in to extract article publish dates. - * @param extractor a concrete instance of {@link PublishDateExtractor} - * @throws IllegalArgumentException if the instance passed in is null - */ + * Pass in to extract article publish dates. 
+ * @param extractor a concrete instance of {@link PublishDateExtractor} + * @throws IllegalArgumentException if the instance passed in is null + */ def setPublishDateExtractor(extractor: PublishDateExtractor) { if (extractor == null) throw new IllegalArgumentException("extractor must not be null!") this.publishDateExtractor = extractor @@ -107,14 +154,20 @@ class Configuration { } /** - * Pass in to extract any additional data not defined within {@link Article} - * @param extractor a concrete instance of {@link AdditionalDataExtractor} - * @throws IllegalArgumentException if the instance passed in is null - */ + * Pass in to extract any additional data not defined within {@link Article} + * @param extractor a concrete instance of {@link AdditionalDataExtractor} + * @throws IllegalArgumentException if the instance passed in is null + */ def setAdditionalDataExtractor(extractor: AdditionalDataExtractor) { this.additionalDataExtractor = extractor } + var openGraphDataExtractor: OpenGraphDataExtractor = new OpenGraphDataExtractor + + def getOpenGraphDataExtractor: OpenGraphDataExtractor = { + openGraphDataExtractor + } + var htmlFetcher: AbstractHtmlFetcher = HtmlFetcher def setHtmlFetcher(fetcher: AbstractHtmlFetcher) { @@ -124,4 +177,52 @@ class Configuration { def getHtmlFetcher: AbstractHtmlFetcher = htmlFetcher -} \ No newline at end of file + // Refactory this in a YML file (like Ruby) + def resolveCharSet(url: String, entity: HttpEntity): String = { + // if (contentType == null) { + // encodingType = "UTF-8" + // } else { + // encodingType = contentType.getCharset().name + // } + + /* from andhapp@github + import org.mozilla.universalchardet.UniversalDetector + var encodingType: String = "UTF-8" + try { + encodingType = EntityUtils.getContentCharSet(entity) + + if (encodingType == null) { + + val buf: Array[Byte] = new Array[Byte](2048) + var instream2: InputStream = new ByteArrayInputStream(responseBytes) + var bytesRead: Int = 2048 + var inLoop = true + + 
detector = new UniversalDetector(null); + + while (inLoop) { + var n: Int = instream2.read(buf) + bytesRead += 2048 + + if (n < 0) inLoop = false + if (inLoop && !detector.isDone()) { + detector.handleData(buf, 0, n) + } + } + + detector.dataEnd() + encodingType = detector.getDetectedCharset() + println("The encoding: " + encodingType) + detector.reset() + } +*/ + var host = new URL(url).getHost() + + host match { + case "www1.folha.uol.com.br" => return "ISO-8859-1" + case "espn.estadao.com.br" => return "ISO-8859-1" + case _ => return Option(EntityUtils.getContentCharSet(entity)) getOrElse "UTF-8" + } + } + +} diff --git a/src/main/scala/com/gravity/goose/Crawler.scala b/src/main/scala/com/gravity/goose/Crawler.scala index 4f3b32344..5e15b4e95 100644 --- a/src/main/scala/com/gravity/goose/Crawler.scala +++ b/src/main/scala/com/gravity/goose/Crawler.scala @@ -18,15 +18,16 @@ package com.gravity.goose -import cleaners.{StandardDocumentCleaner, DocumentCleaner} +import cleaners.{ StandardDocumentCleaner, DocumentCleaner } import extractors.ContentExtractor -import images.{Image, UpgradedImageIExtractor, ImageExtractor} +import images.{ Image, UpgradedImageIExtractor, ImageExtractor } import org.apache.http.client.HttpClient -import org.jsoup.nodes.{Document, Element} +import org.jsoup.nodes.{ Document, Element } import org.jsoup.Jsoup import java.io.File -import utils.{ParsingCandidate, URLHelper, Logging} -import com.gravity.goose.outputformatters.{StandardOutputFormatter, OutputFormatter} +import utils.{ ParsingCandidate, URLHelper, Logging } +import com.gravity.goose.outputformatters.{ StandardOutputFormatter, OutputFormatter } +import scala.collection.JavaConversions._ /** * Created by Jim Plush @@ -34,7 +35,19 @@ import com.gravity.goose.outputformatters.{StandardOutputFormatter, OutputFormat * Date: 8/18/11 */ -case class CrawlCandidate(config: Configuration, url: String, rawHTML: String = null) +/** + * Represents the information we may know of a page we 
crawl. + * + * @param config the configuration. + * @param url the URL of the page. + * @param rawHTML the raw HTML page source -- optional. If not specified, and + * fetching is configured in {@code config}, the page will be + * downloaded. + * @param lang the surmised language of the page -- optional. Used as a fallback + * when the page does not report its language. + */ +case class CrawlCandidate(config: Configuration, url: String, + rawHTML: String = null, lang: String = null) class Crawler(config: Configuration) { @@ -46,6 +59,7 @@ class Crawler(config: Configuration) { parseCandidate <- URLHelper.getCleanedUrl(crawlCandidate.url) rawHtml <- getHTML(crawlCandidate, parseCandidate) doc <- getDocument(parseCandidate.url.toString, rawHtml) + lang = crawlCandidate.lang } { trace("Crawling url: " + parseCandidate.url) @@ -58,24 +72,28 @@ class Crawler(config: Configuration) { article.linkhash = parseCandidate.linkhash article.rawHtml = rawHtml article.doc = doc - article.rawDoc = doc.clone() + article.rawDoc = doc.clone article.title = extractor.getTitle(article) - article.publishDate = config.publishDateExtractor.extract(doc) + article.publishDate = Option(config.publishDateExtractor.extract(doc)).map(_.toDate).getOrElse(null) article.additionalData = config.getAdditionalDataExtractor.extract(doc) article.metaDescription = extractor.getMetaDescription(article) article.metaKeywords = extractor.getMetaKeywords(article) article.canonicalLink = extractor.getCanonicalLink(article) article.tags = extractor.extractTags(article) + article.openGraphData = config.getOpenGraphDataExtractor.extract(doc) // before we do any calcs on the body itself let's clean up the document article.doc = docCleaner.clean(article) + if (article.publishDate == null) { + article.publishDate = extractor.getDateFromURL(article.canonicalLink) + } - - extractor.calculateBestNodeBasedOnClustering(article) match { + // extractor.calculateBestNodeBasedOnClustering(article, config.language) match { + 
extractor.calculateBestNodeBasedOnClustering(article, lang) match { case Some(node: Element) => { - article.topNode = node - article.movies = extractor.extractVideos(article.topNode) + article.movies = extractor.extractVideos(node) + article.links = extractor.extractLinks(node) if (config.enableImageFetching) { trace(logPrefix + "Image fetching enabled...") @@ -83,28 +101,31 @@ class Crawler(config: Configuration) { try { if (article.rawDoc == null) { article.topImage = new Image + article.allImages = Nil } else { - article.topImage = imageExtractor.getBestImage(article.rawDoc, article.topNode) + if (config.enableAllImagesFetching) { + article.topImage = imageExtractor.getBestImage(article.rawDoc, node) + article.allImages = imageExtractor.getAllImages(node) + } } } catch { case e: Exception => { - warn(e, e.toString) + warn(e, e.getMessage) + throw e } } } - article.topNode = extractor.postExtractionCleanup(article.topNode) - + article.topNode = extractor.postExtractionCleanup(node, lang) + article.cleanedArticleText = outputFormatter.getFormattedText(node, lang) + article.htmlArticle = outputFormatter.cleanupHtml(node, lang) - - article.cleanedArticleText = outputFormatter.getFormattedText(article.topNode) } case _ => trace("NO ARTICLE FOUND") } releaseResources(article) article } - article } @@ -121,7 +142,6 @@ class Crawler(config: Configuration) { } } - def getImageExtractor(article: Article): ImageExtractor = { val httpClient: HttpClient = config.getHtmlFetcher.getHttpClient new UpgradedImageIExtractor(httpClient, article, config) @@ -138,7 +158,7 @@ class Crawler(config: Configuration) { def getDocument(url: String, rawlHtml: String): Option[Document] = { try { - Some(Jsoup.parse(rawlHtml)) + Some(Jsoup.parse(rawlHtml, url)) } catch { case e: Exception => { trace("Unable to parse " + url + " properly into JSoup Doc") @@ -152,26 +172,32 @@ class Crawler(config: Configuration) { } /** - * cleans up any temp files we have laying around like temp images - * 
removes any image in the temp dir that starts with the linkhash of the url we just parsed - */ + * cleans up any temp files we have laying around like temp images + * removes any image in the temp dir that starts with the linkhash of the url we just parsed + */ def releaseResources(article: Article) { trace(logPrefix + "STARTING TO RELEASE ALL RESOURCES") - - val dir: File = new File(config.localStoragePath) - - dir.list.foreach(filename => { - if (filename.startsWith(article.linkhash)) { - val f: File = new File(dir.getAbsolutePath + "/" + filename) - if (!f.delete) { - warn("Unable to remove temp file: " + filename) + if (config.getEnableImageFetching) { + val dir: File = new File(config.localStoragePath) + if (dir.isDirectory && dir.exists) { + val list = dir.list + if (list == null) { + throw new RuntimeException(s"Can't list dir ${dir.getAbsolutePath}") } + list.foreach(filename => { + if (filename.startsWith(article.linkhash)) { + val f: File = new File(dir.getAbsolutePath + "/" + filename) + if (!f.delete) { + warn("Unable to remove temp file: " + filename) + } + } + }) } - }) + } } } object Crawler extends Logging { val logPrefix = "crawler: " -} \ No newline at end of file +} diff --git a/src/main/scala/com/gravity/goose/FetchMany.scala b/src/main/scala/com/gravity/goose/FetchMany.scala new file mode 100644 index 000000000..0fcbaf917 --- /dev/null +++ b/src/main/scala/com/gravity/goose/FetchMany.scala @@ -0,0 +1,51 @@ +package com.gravity.goose + +import scala.io.Source +import sys.process._ + +object FetchMany { + def main(args: Array[String]) { + try { + val config: Configuration = new Configuration + config.enableImageFetching = true + config.imagemagickConvertPath = "/usr/bin/convert" + config.imagemagickIdentifyPath = "/usr/bin/identify" + config.localStoragePath = "/tmp/goose" + config.minBytesForImages = 4500 + val goose = new Goose(config) + + var i = 0 + for(line <- Source.fromFile(args(0) + "urllist.txt").getLines()) { + val out = new 
java.io.FileWriter(args(0) + i) + val url: String = line + println("FETCH: Goose is fetching into " + i + ": " + url) + var done: Boolean = false + for(attempt <- 1 to 5) { + try { + if(!done) { + println("FETCH: -- Attempt " + attempt) + val article = goose.extractContent(url) + println("FETCH: -- Got: " + article.title) + out.write(article.cleanedArticleText + "\n" + article.topImage.imageSrc + "\n" + article.title) + done = true + } + } + catch { + case e: Exception => { + e.printStackTrace() + } + } + } + out.close + i = i + 1 + } + } + catch { + case e: Exception => { + e.printStackTrace(); + } + } + } +} + + diff --git a/src/main/scala/com/gravity/goose/Goose.scala b/src/main/scala/com/gravity/goose/Goose.scala index 23f724bdb..b8ff0c7ef 100644 --- a/src/main/scala/com/gravity/goose/Goose.scala +++ b/src/main/scala/com/gravity/goose/Goose.scala @@ -20,27 +20,32 @@ package com.gravity.goose import network.HtmlFetcher import java.io.File +import org.apache.commons.lang.NotImplementedException /** * Created by Jim Plush - Gravity.com * Date: 8/14/11 */ -class Goose(config: Configuration = new Configuration) { - - - initializeEnvironment() +class Goose(var config : Configuration = new Configuration) { + def setConfig(configuration: Configuration) = { + config = configuration + if (configuration.getEnableImageFetching) throw new NotImplementedException("image fetching should be rewritten before it can be used in GAE") + } /** - * Main method to extract an article object from a URL, pass in a url and get back a Article - * @url The url that you want to extract + * Main method to extract an article object from a URL, pass in a url and get + * back an Article. + * + * @param url the URL of the page. + * @param rawHTML the raw HTML page source -- optional. If not specified, and + * fetching is configured in {@code config}, the page will be + * downloaded. + * @param lang the surmised language of the page -- optional. 
Used as a fallback + * when the page does not report its language. */ - def extractContent(url: String, rawHTML: String): Article = { - val cc = new CrawlCandidate(config, url, rawHTML) - sendToActor(cc) - } - - def extractContent(url: String): Article = { - val cc = new CrawlCandidate(config, url, null) + def extractContent(url: String, + rawHTML: String = null, lang: String = "all"): Article = { + val cc = new CrawlCandidate(config, url, rawHTML, lang) sendToActor(cc) } @@ -57,20 +62,15 @@ class Goose(config: Configuration = new Configuration) { def initializeEnvironment() { val f = new File(config.localStoragePath) - try { if (!f.isDirectory) { f.mkdirs() } - } catch { - case e: Exception => - } if (!f.isDirectory) { throw new Exception(config.localStoragePath + " directory does not seem to exist, you need to set this for image processing downloads") } if (!f.canWrite) { throw new Exception(config.localStoragePath + " directory is not writeble, you need to set this for image processing downloads") } - // todo cleanup any jank that may be in the tmp folder currently } diff --git a/src/main/scala/com/gravity/goose/JsonMain.scala b/src/main/scala/com/gravity/goose/JsonMain.scala new file mode 100644 index 000000000..ca60b9e35 --- /dev/null +++ b/src/main/scala/com/gravity/goose/JsonMain.scala @@ -0,0 +1,69 @@ +package com.gravity.goose + +import scala.collection.mutable.Map + +import com.gravity.goose.util.JsonUtil + +object JsonMain { + + def main(args: Array[String]) { + try { + val url = args(0) + val json = getArticleAsJson(url) + println(json) + } catch { + case e: Exception => e.printStackTrace() + } + } + def getArticleAsJson(url: String) = { + val map = Map[String, Any]() + println("read article from [" + url + "]") + if (url == null || url.length == 0) { + map.put("error", true) + map.put("message", "No URL specified") + } else { + val config = new Configuration + config.setImagemagickConvertPath("/usr/bin/convert") + 
config.setImagemagickIdentifyPath("/usr/bin/identify") + config.setLocalStoragePath("./storage") + config.setMinBytesForImages(500) + val goose = new Goose(config) + val article = goose.extractContent(url) + map.put("success", true) + + + map.put("title", encodeHTML(article.getTitle)) + val image = article.getTopImage + if (image != null) { + map.put("image", article.getTopImage.getImageSrc) + } + map.put("images", article.getAllImages) + map.put("movies", article.getMovies) + map.put("link", article.getCanonicalLink) + map.put("tags", article.getTags) + map.put("text", encodeHTML(article.getCleanedArticleText)) + map.put("date", article.getPublishDate) + map.put("desc", encodeHTML(article.getMetaDescription)) + map.put("keywords", article.getMetaKeywords) + } + val responseString = JsonUtil.toJson(map) + println(responseString) + responseString + } + + def encodeHTML(s: String): String = { + if (s == null) { + return "" + } + val out = new StringBuffer() + for (i <- 0 until s.length) { + val c = s.charAt(i) + if (c > 127 || c == '"' || c == '<' || c == '>') { + out.append("&#" + c.toInt + ";") + } else { + out.append(c) + } + } + out.toString + } +} diff --git a/src/main/scala/com/gravity/goose/TalkToMeGoose.scala b/src/main/scala/com/gravity/goose/TalkToMeGoose.scala index fba111b88..8dcddad39 100644 --- a/src/main/scala/com/gravity/goose/TalkToMeGoose.scala +++ b/src/main/scala/com/gravity/goose/TalkToMeGoose.scala @@ -1,42 +1,54 @@ package com.gravity.goose -/** - * Created by Jim Plush - * User: jim - * Date: 5/13/11 - */ +import java.io._ +import scala.collection.JavaConversions._ +import scala.io.Source + object TalkToMeGoose { /** - * you can use this method if you want to run goose from the command line to extract html from a bashscript - * or to just test it's functionality - * you can run it like so - * cd into the goose root - * mvn compile - * MAVEN_OPTS="-Xms256m -Xmx2000m"; mvn exec:java -Dexec.mainClass=com.gravity.goose.TalkToMeGoose 
-Dexec.args="http://techcrunch.com/2011/05/13/native-apps-or-web-apps-particle-code-wants-you-to-do-both/" -e -q > ~/Desktop/gooseresult.txt - * - * Some top gun love: - * Officer: [in the midst of the MIG battle] Both Catapults are broken, sir. - * Stinger: How long will it take? - * Officer: It'll take ten minutes. - * Stinger: Bullshit ten minutes! This thing will be over in two minutes! Get on it! - * - * @param args - */ + * You can use this method to run goose from the command line + * to extract html from a bash script, or to just test its functionality: + * + * cd into the goose root + * mvn compile + * MAVEN_OPTS="-Xms256m -Xmx2000m"; mvn exec:java -Dexec.mainClass=com.gravity.goose.TalkToMeGoose -Dexec.args="http://techcrunch.com/2011/05/13/native-apps-or-web-apps-particle-code-wants-you-to-do-both/" -e -q > ~/Desktop/gooseresult.txt + * + * or if using sbt: + * + * cd into the goose root + * sbt + * > run http://www.thestar.com/news/insight/2013/04/26/spotting_tiny_gnatcatcher_can_put_a_spring_in_your_step.html + * + */ def main(args: Array[String]) { try { - val url: String = args(0) - val config: Configuration = new Configuration - config.enableImageFetching = false - val goose = new Goose(config) - val article = goose.extractContent(url) - println(article.cleanedArticleText) - } - catch { + println("URL to extract article from:") + val url: String = if (args.isEmpty) readLine() else args(0) + talk(url) + //talk2(url) + } catch { case e: Exception => { System.out.println("Make sure you pass in a valid URL: " + e.toString) + e.printStackTrace() } } } -} + def talk(url: String) { + val config: Configuration = new Configuration + config.enableImageFetching = false + config.imagemagickConvertPath = "/usr/bin/convert" + config.imagemagickIdentifyPath = "/usr/bin/identify" + config.localStoragePath = "/tmp/goose" + config.minBytesForImages = 4500 + val goose = new Goose(config) + val article = goose.extractContent(url) + println("TITLE: " + article.title) + 
println("DATE: " + article.publishDate) + println("TAGS: " + article.tags) + println("TEXT: " + article.cleanedArticleText) + println(article.topImage.imageSrc) + println(article.title) + } +} diff --git a/src/main/scala/com/gravity/goose/TalkToMeGooseAndCassandra.scala b/src/main/scala/com/gravity/goose/TalkToMeGooseAndCassandra.scala new file mode 100644 index 000000000..c69bd78ed --- /dev/null +++ b/src/main/scala/com/gravity/goose/TalkToMeGooseAndCassandra.scala @@ -0,0 +1,112 @@ +package com.gravity.goose + +import java.io._ +import scala.collection.JavaConversions._ +import scala.io.Source + + +object TalkToMeGooseAndCassandra { + /** + * Run Goose on a Cassandra keyspace column_family. Iterates over each key + * in the keyspace's column_family, uses that key as a URL to extract + * content from, and updates the row corresponding to that key with the + * content. + * mvn compile + * mvn exec:java -Dexec.mainClass=com.gravity.goose.TalkToMeGoose -Dexec.args="keyspace column_family" + */ + + def main(args: Array[String]) { + println("for cassandra uncomment this and dependencies in build.sbt and/or pom.xml") + /* +import com.netflix.astyanax.AstyanaxContext +import com.netflix.astyanax.Keyspace +import com.netflix.astyanax.MutationBatch +import com.netflix.astyanax.connectionpool.NodeDiscoveryType +import com.netflix.astyanax.connectionpool.OperationResult +import com.netflix.astyanax.connectionpool.impl.ConnectionPoolConfigurationImpl +import com.netflix.astyanax.connectionpool.impl.CountingConnectionPoolMonitor +import com.netflix.astyanax.impl.AstyanaxConfigurationImpl +import com.netflix.astyanax.model.ColumnFamily +import com.netflix.astyanax.model.Row +import com.netflix.astyanax.serializers.StringSerializer +import com.netflix.astyanax.serializers.StringSerializer +import com.netflix.astyanax.thrift.ThriftFamilyFactory +import com.netflix.astyanax.util.RangeBuilder + val keyspace_name: String = args(0) + val column_family_name: String = args(1) + val 
context = new AstyanaxContext.Builder() + .forCluster("Cluster") + .forKeyspace(keyspace_name) + .withAstyanaxConfiguration(new AstyanaxConfigurationImpl() + .setDiscoveryType(NodeDiscoveryType.RING_DESCRIBE) + .setTargetCassandraVersion("1.2") + ) + .withConnectionPoolConfiguration(new ConnectionPoolConfigurationImpl("MyConnectionPool") + .setPort(9160) + .setMaxConnsPerHost(1) + .setSeeds("127.0.0.1:9160") + ) + .withConnectionPoolMonitor(new CountingConnectionPoolMonitor()) + .buildKeyspace(ThriftFamilyFactory.getInstance()) + + context.start() + + val keyspace = context.getEntity() + val column_family = ColumnFamily.newColumnFamily( + column_family_name, + StringSerializer.get(), + StringSerializer.get()) + + val url_list = keyspace.prepareQuery(column_family) + .getAllRows() + .withColumnSlice(List("lastc", "lastv")) + .execute().getResult() + val mutation_batch = keyspace.prepareMutationBatch() + + var url = "" + var lastc = "" + var lastv = "" + var article = "" + var timestamp = "" + + val config: Configuration = new Configuration + config.enableImageFetching = false + val goose = new Goose(config) + + for (url_row <- url_list) { + url = url_row.getKey() + lastc = url_row.getColumns().getStringValue("lastc", "") + lastv = url_row.getColumns().getStringValue("lastv", "") + + try { + val gec = goose.extractContent(url) + article = gec.cleanedArticleText + timestamp = System.currentTimeMillis.toString + + if (article == "") { + article = gec.rawHtml + } + + if (article == lastc) { + mutation_batch.withRow(column_family, url) + .putColumn(timestamp, lastv) + } else { + lastv = "v" + (lastv.replace("v", "").toInt + 1) + mutation_batch.withRow(column_family, url) + .putColumn("lastc", article) + .putColumn("lastv", lastv) + .putColumn(lastv, article) + .putColumn(timestamp, lastv) + } + + mutation_batch.execute() + System.out.println("Extracted content from " + url) + } catch { + case e: Exception => { + System.out.println(url + " is not a valid URL: " + 
e.toString) + } + } + } + */ + } +} diff --git a/src/main/scala/com/gravity/goose/TalkToMeGooseAndGae.scala b/src/main/scala/com/gravity/goose/TalkToMeGooseAndGae.scala new file mode 100644 index 000000000..4ac7ddf07 --- /dev/null +++ b/src/main/scala/com/gravity/goose/TalkToMeGooseAndGae.scala @@ -0,0 +1,59 @@ +package com.gravity.goose + +import java.io._ +import scala.collection.JavaConversions._ +import scala.io.Source + +object TalkToMeGooseAndGae { + import com.google.appengine.tools.development.testing.LocalURLFetchServiceTestConfig + import com.google.appengine.tools.development.testing.LocalServiceTestHelper + val URLConfig: LocalURLFetchServiceTestConfig = new LocalURLFetchServiceTestConfig() + val Helper: LocalServiceTestHelper = new LocalServiceTestHelper(URLConfig) + + /** + * You can use this method to run goose from the command line + * to extract html from a bash script, or to just test its functionality: + * + * cd into the goose root + * mvn compile + * MAVEN_OPTS="-Xms256m -Xmx2000m"; mvn exec:java -Dexec.mainClass=com.gravity.goose.TalkToMeGoose -Dexec.args="http://techcrunch.com/2011/05/13/native-apps-or-web-apps-particle-code-wants-you-to-do-both/" -e -q > ~/Desktop/gooseresult.txt + * + * or if using sbt: + * + * cd into the goose root + * sbt + * > run http://www.thestar.com/news/insight/2013/04/26/spotting_tiny_gnatcatcher_can_put_a_spring_in_your_step.html + * + */ + def main(args: Array[String]) { + try { + println("URL to extract article from:") + val url: String = if (args.isEmpty) readLine() else args(0) + talk(url) + //talk2(url) + } catch { + case e: Exception => { + System.out.println("Make sure you pass in a valid URL: " + e.toString) + e.printStackTrace() + } + } finally { + Helper.tearDown() + } + } + + def talk(url: String) { + import org.apache.log4j.BasicConfigurator + Helper.setUp() + BasicConfigurator.configure(); + + val config: Configuration = new Configuration + config.enableImageFetching = false + + val goose = new 
Goose() + goose.setConfig(config) + + val article = goose.extractContent(url) + println("Tags: " + article.getTags()) + println(article.cleanedArticleText) + } +} diff --git a/src/main/scala/com/gravity/goose/UrlEndpoint.scala b/src/main/scala/com/gravity/goose/UrlEndpoint.scala new file mode 100644 index 000000000..a2ad44527 --- /dev/null +++ b/src/main/scala/com/gravity/goose/UrlEndpoint.scala @@ -0,0 +1,36 @@ +package com.gravity.goose + +import java.net.InetSocketAddress + +import org.simpleframework.http.Request +import org.simpleframework.http.Response +import org.simpleframework.http.core.Container +import org.simpleframework.transport.connect.SocketConnection + +object URLEndpoint { + + def main(args: Array[String]) { + val container = new URLEndpoint() + val connection = new SocketConnection(container) + val address = new InetSocketAddress(8890) + connection.connect(address) + } +} + +class URLEndpoint extends Container { + def handle(request: Request, response: Response) { + try { + val url = request.getQuery.get("url") + val body = response.getPrintStream + val responseString = JsonMain.getArticleAsJson(url) + body.println(responseString) + response.set("Content-Type", "application/json") + val time = System.currentTimeMillis() + response.setDate("Date", time) + response.setDate("Last-Modified", time) + body.close() + } catch { + case e: Exception => e.printStackTrace() + } + } +} diff --git a/src/main/scala/com/gravity/goose/cleaners/DocumentCleaner.scala b/src/main/scala/com/gravity/goose/cleaners/DocumentCleaner.scala index 27c541d2d..240001ff8 100644 --- a/src/main/scala/com/gravity/goose/cleaners/DocumentCleaner.scala +++ b/src/main/scala/com/gravity/goose/cleaners/DocumentCleaner.scala @@ -43,9 +43,12 @@ trait DocumentCleaner { def clean(article: Article): Document = { trace("Starting cleaning phase with DefaultDocumentCleaner") + var docToClean: Document = article.doc.clone + trace("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
BEFORE CLEAN %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") + trace(docToClean.html) + trace("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% BEFORE CLEAN %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") - var docToClean: Document = article.doc - docToClean = cleanEmTags(docToClean) + docToClean = cleanTextTags(docToClean) docToClean = removeDropCaps(docToClean) docToClean = removeScriptsAndStyles(docToClean) docToClean = cleanBadTags(docToClean) @@ -59,25 +62,32 @@ trait DocumentCleaner { // docToClean = convertDivsToParagraphs(docToClean, "div") // docToClean = convertDivsToParagraphs(docToClean, "span") - // docToClean = convertDivsToParagraphs(docToClean, "span") + trace("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% AFTER CLEAN %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") + trace(docToClean.html) + trace("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% AFTER CLEAN %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") docToClean } /** - * replaces tags with textnodes + * replaces various tags with textnodes */ - private def cleanEmTags(doc: Document): Document = { - val ems: Elements = doc.getElementsByTag("em") - + private def cleanTextTags(doc: Document): Document = { + var ems: Elements = doc.getElementsByTag("em") + ems ++= doc.getElementsByTag("strong") + ems ++= doc.getElementsByTag("b") + ems ++= doc.getElementsByTag("i") + ems ++= doc.getElementsByTag("strike") + ems ++= doc.getElementsByTag("del") + ems ++= doc.getElementsByTag("ins") for { node <- ems images: Elements = node.getElementsByTag("img") if (images.size == 0) } { - val tn: TextNode = new TextNode(node.text, doc.baseUri) + val tn: TextNode = new TextNode(node.text.trim+" ", doc.baseUri) node.replaceWith(tn) } - trace(ems.size + " EM tags modified") + trace(ems.size + " EM/strong/b/i/strike/del/ins tags modified") doc } @@ -129,40 +139,46 @@ trait DocumentCleaner { } private def 
cleanBadTags(doc: Document): Document = { - val children: Elements = doc.body.children - val naughtyList: Elements = children.select(queryNaughtyIDs) - trace(naughtyList.size + " naughty ID elements found") + /* jsoup 1.7.3 can return pages without a body. */ + if (doc.body != null) { + val children: Elements = doc.body.children + val naughtyList: Elements = children.select(queryNaughtyIDs) + trace(naughtyList.size + " naughty ID elements found") - import scala.collection.JavaConversions._ - for (node <- naughtyList) { - trace("Removing node with id: " + node.id) - removeNode(node) - } + import scala.collection.JavaConversions._ + for (node <- naughtyList) { + trace("Removing node with id: " + node.id) + removeNode(node) + } - val naughtyList2: Elements = children.select(queryNaughtyIDs) - trace(naughtyList2.size + " naughty ID elements found after removal") + val naughtyList2: Elements = children.select(queryNaughtyIDs) + trace(naughtyList2.size + " naughty ID elements found after removal") - val naughtyClasses: Elements = children.select(queryNaughtyClasses) + val naughtyClasses: Elements = children.select(queryNaughtyClasses) - trace(naughtyClasses.size + " naughty CLASS elements found") + trace(naughtyClasses.size + " naughty CLASS elements found") - for (node <- naughtyClasses) { - trace("Removing node with class: " + node.className) - removeNode(node) - } + for (node <- naughtyClasses) { + trace("Removing node with class: " + node.className) + removeNode(node) + } - val naughtyClasses2: Elements = children.select(queryNaughtyClasses) - trace(naughtyClasses2.size + " naughty CLASS elements found after removal") + val naughtyClasses2: Elements = children.select(queryNaughtyClasses) + trace(naughtyClasses2.size + " naughty CLASS elements found after removal") - val naughtyList5: Elements = children.select(queryNaughtyNames) + val naughtyList5: Elements = children.select(queryNaughtyNames) - trace(naughtyList5.size + " naughty Name elements found") + 
trace(naughtyList5.size + " naughty Name elements found") - for (node <- naughtyList5) { + for (node <- naughtyList5) { - trace("Removing node with class: " + node.attr("class") + " id: " + node.id + " name: " + node.attr("name")) + trace("Removing node with class: " + node.attr("class") + " id: " + node.id + " name: " + node.attr("name")) - removeNode(node) + removeNode(node) + } + } + else { + trace("Document has no body.") } doc } @@ -379,10 +395,9 @@ object DocumentCleaner extends Logging { var sb: StringBuilder = new StringBuilder // create negative elements - sb.append("^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor") + sb.append("^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|comment(?!ed)|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|links|meta$|scroll(?!able)|shoutbox|sponsor") sb.append("|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|remember-tool-tip") - sb.append("|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text") - + sb.append("|communitypromo|promo_holder|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text") /** * this regex is used to remove undesirable nodes from our doc * indicate that something maybe isn't content but more of a comment, footer or some other undesirable node diff --git a/src/main/scala/com/gravity/goose/extractors/ContentExtractor.scala b/src/main/scala/com/gravity/goose/extractors/ContentExtractor.scala index 51838e335..652f99725 100644 --- a/src/main/scala/com/gravity/goose/extractors/ContentExtractor.scala +++ 
b/src/main/scala/com/gravity/goose/extractors/ContentExtractor.scala @@ -22,11 +22,14 @@ import com.gravity.goose.text._ import com.gravity.goose.utils.Logging import java.net.URL import java.util.ArrayList +import java.util.Date import scala.collection.mutable +import scala.collection.immutable import scala.collection.JavaConversions._ import org.jsoup.nodes.{Attributes, Element, Document} import org.jsoup.select._ - +import com.gravity.goose.Language._ +import scala.math._ /** * Created by Jim Plush * User: jim @@ -53,7 +56,7 @@ trait ContentExtractor { val SPACE_SPLITTER: StringSplitter = new StringSplitter(" ") val NO_STRINGS = Set.empty[String] val A_REL_TAG_SELECTOR: String = "a[rel=tag], a[href*=/tag/]" - val TOP_NODE_TAGS = new TagsEvaluator(Set("p", "td", "pre")) + val TOP_NODE_TAGS = new TagsEvaluator(Set("p", "td", "pre", "strong", "li", "code")) def getTitle(article: Article): String = { var title: String = string.empty @@ -107,21 +110,23 @@ trait ContentExtractor { * @return */ def doTitleSplits(title: String, splitter: StringSplitter): String = { - var largetTextLen: Int = 0 - var largeTextIndex: Int = 0 + var largestTextLen: Int = 0 + var largestTextIndex: Int = 0 val titlePieces: Array[String] = splitter.split(title) - var i: Int = 0 - while (i < titlePieces.length) { - - val current: String = titlePieces(i) - if (current.length > largetTextLen) { - largetTextLen = current.length - largeTextIndex = i + if (titlePieces.length > 0) { + var i: Int = 0 + while (i < titlePieces.length) { + val current: String = titlePieces(i) + if (current.length > largestTextLen) { + largestTextLen = current.length + largestTextIndex = i + } + i += 1 } - i += 1 + TITLE_REPLACEMENTS.replaceAll(titlePieces(largestTextIndex)).trim } - TITLE_REPLACEMENTS.replaceAll(titlePieces(largeTextIndex)).trim + else null } private def getMetaContent(doc: Document, metaName: String): String = { @@ -137,7 +142,19 @@ trait ContentExtractor { * if the article has meta description set in 
the source, use that */ def getMetaDescription(article: Article): String = { - getMetaContent(article.doc, "meta[name=description]") + var desc = article.doc.select("meta[name=description]").attr("content") + if (desc.isEmpty) { + desc = article.doc.select("meta[property=og:description]").attr("content") + if (desc.isEmpty) { + desc = article.doc.select("meta[name=twitter:description]").attr("content") + } + } + + if (desc.nonEmpty) { + desc.trim + } else { + string.empty + } } /** @@ -152,9 +169,22 @@ trait ContentExtractor { * if the article has meta canonical link set in the url */ def getCanonicalLink(article: Article): String = { - val meta = article.doc.select("link[rel=canonical]") - if (meta.size() > 0) { - val href = Option(meta.first().attr("href")).getOrElse("").trim + var url = article.doc.select("link[rel=canonical]").attr("abs:href") + trace(logPrefix + " base uri: " + article.doc.baseUri) + trace(logPrefix + " canonical link: " + url) + + if (url.isEmpty) { + url = article.doc.select("meta[property=og:url]").attr("abs:content") + + trace(logPrefix + " canonical link meta og: " + url) + if (url.isEmpty) { + url = article.doc.select("meta[name=twitter:url]").attr("abs:content") + + trace(logPrefix + " canonical link meta twitter: " + url) + } + } + if (url.nonEmpty) { + val href = url.trim if (href.nonEmpty) href else article.finalUrl } else { article.finalUrl @@ -179,6 +209,69 @@ trait ContentExtractor { tags.toSet } + def getDateFromURL(url: String): Date = { + val path = new URL(url).getPath + + var year: Integer = -1; + var yearCounter: Integer = -1; + var month: Integer = -1; + var monthCounter: Integer = -1; + var day: Integer = -1; + var done: Boolean = false + val strs = path.split("/"); + for ((str, counter) <- strs.zipWithIndex) { + if (!done) { + if (str.length() == 4 && yearCounter < 0) { + try { + year = Integer.parseInt(str); + if (year < 1970 || year > 3000) { + year = -1; + } else { + trace(logPrefix + " found year: " + year) + 
yearCounter = counter; + } + } catch { + case _ : java.lang.NumberFormatException => None + } + } else if (str.length() == 2) { + if (monthCounter < 0 && counter == yearCounter + 1) { + try { + month = Integer.parseInt(str); + if (month < 1 || month > 12) { + month = -1; + } else { + trace(logPrefix + " found month: " + month) + monthCounter = counter; + } + } catch { + case _ : java.lang.NumberFormatException => None + } + } else if (counter == monthCounter + 1) { + try { + day = Integer.parseInt(str); + if (day < 1 || day > 31) { + day = -1; + } else { + trace(logPrefix + " found day: " + day) + done = true + } + } catch { + case _ : java.lang.NumberFormatException => None + } + } + } + } + } + + // should be converted to use jodatime or something, because java's date is terrible + if (year < 0) return null; + year = year - 1900 // date constructor takes year - 1900 + if (month < 1) return new Date(year, 0, 1) + month = month - 1 // date constructor dates month in 0 - 11 + if (day < 1) return new Date(year, month, 1) + return new Date(year, month, day) + } + /** * we're going to start looking for where the clusters of paragraphs are. 
We'll score a cluster based on the number of stopwords * and the number of consecutive paragraphs together, which should form the cluster of text that this node is around @@ -188,20 +281,24 @@ trait ContentExtractor { * @return */ - def calculateBestNodeBasedOnClustering(article: Article): Option[Element] = { + //def calculateBestNodeBasedOnClustering(article: Article, language: Language): Option[Element] = { + def calculateBestNodeBasedOnClustering(article: Article, + lang:String): Option[Element] = { trace(logPrefix + "Starting to calculate TopNode") - val doc = article.doc + val doc = article.doc.clone var topNode: Element = null val nodesToCheck = Collector.collect(TOP_NODE_TAGS, doc) var startingBoost: Double = 1.0 var cnt: Int = 0 var i: Int = 0 - val parentNodes = mutable.HashSet[Element]() + val parentNodes = mutable.LinkedHashSet[Element]() val nodesWithText = mutable.Buffer[Element]() for (node <- nodesToCheck) { val nodeText: String = node.text - val wordStats: WordStats = StopWords.getStopWordCount(nodeText) +// val wordStats: WordStats = StopWords.getStopWordCount(nodeText, language) + val wordStats: WordStats = StopWords.getStopWordCount(nodeText, lang) val highLinkDensity: Boolean = isHighLinkDensity(node) + trace("Candidate: " + node.tagName() + " score: " + wordStats + " d:" + highLinkDensity + " text:" + nodeText) if (wordStats.getStopWordCount > 2 && !highLinkDensity) { nodesWithText.add(node) } @@ -214,7 +311,8 @@ trait ContentExtractor { for (node <- nodesWithText) { var boostScore: Float = 0 - if (isOkToBoost(node)) { +// if (isOkToBoost(node, language)) { + if (isOkToBoost(node, lang)) { if (cnt >= 0) { boostScore = ((1.0 / startingBoost) * 50).asInstanceOf[Float] startingBoost += 1 @@ -223,18 +321,19 @@ trait ContentExtractor { if (numberOfNodes > 15) { if ((numberOfNodes - i) <= bottomNodesForNegativeScore) { val booster: Float = bottomNodesForNegativeScore.asInstanceOf[Float] - (numberOfNodes - i).asInstanceOf[Float] - boostScore = 
-math.pow(booster, 2.asInstanceOf[Float]).asInstanceOf[Float] - val negscore: Float = math.abs(boostScore) + negativeScoring + boostScore = -pow(booster, 2.asInstanceOf[Float]).asInstanceOf[Float] + val negscore: Float = abs(boostScore) + negativeScoring if (negscore > 40) { boostScore = 5 } } } - trace(logPrefix + "Location Boost Score: " + boostScore + " on interation: " + i + "' id='" + node.parent.id + "' class='" + node.parent.attr("class")) + trace(logPrefix + "Location Boost Score: " + boostScore + " on interation: " + i + " tag='"+ node.tagName +"' id='" + node.parent.id + "' class='" + node.parent.attr("class")) val nodeText: String = node.text - val wordStats: WordStats = StopWords.getStopWordCount(nodeText) +// val wordStats: WordStats = StopWords.getStopWordCount(nodeText, language) + val wordStats: WordStats = StopWords.getStopWordCount(nodeText, lang) val upscore: Int = (wordStats.getStopWordCount + boostScore).asInstanceOf[Int] updateScore(node.parent, upscore) updateScore(node.parent.parent, upscore / 2) @@ -265,6 +364,11 @@ trait ContentExtractor { } } printTraceLog(topNode) + if(topNode != null && getScore(topNode) < 20) + { + debug("TopNode score is too small!") + return None + } if (topNode == null) None else Some(topNode) } @@ -289,7 +393,8 @@ trait ContentExtractor { * @param node * @return */ - private def isOkToBoost(node: Element): Boolean = { +// private def isOkToBoost(node: Element, language: Language): Boolean = { + private def isOkToBoost(node: Element, lang: String): Boolean = { val para = "p" var stepsAway: Int = 0 val minimumStopWordCount = 5 @@ -297,15 +402,16 @@ trait ContentExtractor { walkSiblings(node) { currentNode => { - if (currentNode.tagName == para) { + if (currentNode.tagName == para || currentNode.tagName == "strong") { if (stepsAway >= maxStepsAwayFromNode) { trace(logPrefix + "Next paragraph is too far away, not boosting") return false } val paraText: String = currentNode.text - val wordStats: WordStats = 
StopWords.getStopWordCount(paraText) +// val wordStats: WordStats = StopWords.getStopWordCount(paraText, language) + val wordStats: WordStats = StopWords.getStopWordCount(paraText, lang) if (wordStats.getStopWordCount > minimumStopWordCount) { - trace(logPrefix + "We're gonna boost this node, seems contenty") + trace(logPrefix + "We're gonna boost this node, seems contenty " + debugNode(node)) return true } stepsAway += 1 @@ -326,8 +432,11 @@ trait ContentExtractor { * @param e * @return */ - private def isHighLinkDensity(e: Element): Boolean = { +// private def isHighLinkDensity(e: Element, limit: Double = 0.1): Boolean = { + private def isHighLinkDensity(e: Element, limit: Double = 1.0): Boolean = { + val links: Elements = e.getElementsByTag("a") + links.addAll(e.getElementsByAttribute("onclick")) if (links.size == 0) { return false } @@ -345,9 +454,9 @@ trait ContentExtractor { val linkDivisor: Float = numberOfLinkWords / numberOfWords val score: Float = linkDivisor * numberOfLinks - trace(logPrefix + "Calulated link density score as: " + score + " for node: " + getShortText(e.text, 50)) + trace(logPrefix + "Calculated link density score as: " + score + " for node: " + getShortText(e.text, 50)) - if (score > 1) { + if (score >= limit) { return true } false @@ -427,8 +536,14 @@ trait ContentExtractor { def extractVideos(node: Element): List[Element] = { val candidates: ArrayList[Element] = new ArrayList[Element] val goodMovies = mutable.Buffer[Element]() + val youtubeStr = "youtube" val vimdeoStr = "vimeo" + val bliptvStr = "blip" + val flickrStr = "flickr" + val veohStr = "veoh" + val dailymotionStr = "dailymotion" + try { node.parent.getElementsByTag("embed").foreach(candidates.add(_)) node.parent.getElementsByTag("object").foreach(candidates.add(_)) @@ -439,7 +554,14 @@ trait ContentExtractor { val attrs: Attributes = el.attributes() for (a <- attrs) { try { - if ((a.getValue.contains(youtubeStr) || a.getValue.contains(vimdeoStr)) && (a.getKey == "src")) { + 
if (( + a.getValue.contains(youtubeStr) || + a.getValue.contains(bliptvStr) || + a.getValue.contains(flickrStr) || + a.getValue.contains(veohStr) || + a.getValue.contains(dailymotionStr) || + a.getValue.contains(vimdeoStr) + ) && (a.getKey == "src")) { trace(logPrefix + "This page has a video!: " + a.getValue) goodMovies += el } @@ -464,20 +586,45 @@ trait ContentExtractor { goodMovies.toList } + /** + * pulls out links we like + * + * @return + */ + def extractLinks(node: Element): List[Map[String, String]] = { + val goodLinks = mutable.MutableList[Map[String, String]]() + + val candidates = node.parent.select("a[href]").filter(el => el.attr("href") != "#" && !el.attr("abs:href").trim.isEmpty).map(el => goodLinks += immutable.Map("url" -> el.attr("abs:href"), "text" -> el.text)) + + trace(logPrefix + "extractLinks: Extracted links. Found: " + candidates.size) + + goodLinks.toList + } + def isTableTagAndNoParagraphsExist(e: Element): Boolean = { - val subParagraphs: Elements = e.getElementsByTag("p") + val subParagraphs: Elements = getChildParagraphs(e) for (p <- subParagraphs) { if (p.text.length < 25) { p.remove() } } + val subParagraphs2: Elements = e.getElementsByTag("p") - if (subParagraphs2.size == 0 && !(e.tagName == "td")) { - trace("Removing node because it doesn't have any paragraphs") - true + //val subParagraphs2: Elements = getChildParagraphs(e) + if (subParagraphs2.size == 0 && e.tagName != "td") { + if (e.tagName == "ul" || e.tagName == "ol") { + val linkTextLength = e.getElementsByTag("a").map(_.text.length).sum + val elementTextLength = e.text.length + if (elementTextLength > 0 && (linkTextLength.toFloat / elementTextLength) < 0.5) { + return false // less than half of the list is links, so keep this + } + trace("List failed link density test: " + linkTextLength + " " + elementTextLength + " " + getShortText(e.text, 50)) + } + trace("Removing node because it doesn't have any paragraphs " + e.tagName + " " + e.attr("class")) + return true } else { - 
false + return false } } @@ -488,13 +635,16 @@ trait ContentExtractor { * @param targetNode * @return */ - def postExtractionCleanup(targetNode: Element): Element = { +// def postExtractionCleanup(targetNode: Element, language: Language): Element = { + def postExtractionCleanup(targetNode: Element, lang: String): Element = { trace(logPrefix + "Starting cleanup Node") - val node = addSiblings(targetNode) +// val node = addSiblings(targetNode, language) + val node = addSiblings(targetNode, lang) for { e <- node.children - if (e.tagName != "p") + if (e.tagName != "p" || isHighLinkDensity(e)) + //if (e.tagName != "p" && e.tagName != "strong") } { trace(logPrefix + "CLEANUP NODE: " + e.id + " class: " + e.attr("class")) if (isHighLinkDensity(e) || isTableTagAndNoParagraphsExist(e) || !isNodeScoreThreshholdMet(node, e)) { @@ -505,6 +655,7 @@ trait ContentExtractor { } } } + trace(logPrefix + "Finished cleanup Node") node } @@ -517,7 +668,7 @@ trait ContentExtractor { trace(logPrefix + "topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore + " threshold: " + thresholdScore) if ((currentNodeScore < thresholdScore) && e.tagName != "td") { - trace(logPrefix + "Removing node due to low threshold score") + trace(logPrefix + "Removing node due to low threshold score " + debugNode(e)) false } else { trace(logPrefix + "Not removing TD node") @@ -525,20 +676,30 @@ trait ContentExtractor { } } + def getChildParagraphs(e: Element): Elements = + { + val potentialParagraphs: Elements = e.getElementsByTag("p") + potentialParagraphs.addAll(e.getElementsByTag("strong")) + potentialParagraphs + } + /** * adds any siblings that may have a decent score to this node * * @param currentSibling * @return */ - def getSiblingContent(currentSibling: Element, baselineScoreForSiblingParagraphs: Int): Option[String] = { +// def getSiblingContent(currentSibling: Element, baselineScoreForSiblingParagraphs: Int, language: Language): Option[String] = { + def 
getSiblingContent(currentSibling: Element, + baselineScoreForSiblingParagraphs: Int, + lang: String): Option[String] = { - if (currentSibling.tagName == "p" && currentSibling.text.length() > 0) { + if ((currentSibling.tagName == "p" || currentSibling.tagName == "strong") && currentSibling.text.length() > 0) { Some(currentSibling.outerHtml) } else { - val potentialParagraphs: Elements = currentSibling.getElementsByTag("p") + val potentialParagraphs: Elements = getChildParagraphs(currentSibling) if (potentialParagraphs.first == null) { None } else { @@ -546,7 +707,8 @@ trait ContentExtractor { Some((for { firstParagraph <- potentialParagraphs if (firstParagraph.text.length() > 0) - wordStats: WordStats = StopWords.getStopWordCount(firstParagraph.text) +// wordStats: WordStats = StopWords.getStopWordCount(firstParagraph.text, language) + wordStats: WordStats = StopWords.getStopWordCount(firstParagraph.text, lang) paragraphScore: Int = wordStats.getStopWordCount siblingBaseLineScore: Double = .30 if ((baselineScoreForSiblingParagraphs * siblingBaseLineScore).toDouble < paragraphScore) @@ -575,14 +737,17 @@ trait ContentExtractor { b } - private def addSiblings(topNode: Element): Element = { +// private def addSiblings(topNode: Element, language: Language): Element = { + private def addSiblings(topNode: Element, lang: String): Element = { trace(logPrefix + "Starting to add siblings") - val baselineScoreForSiblingParagraphs: Int = getBaselineScoreForSiblings(topNode) +// val baselineScoreForSiblingParagraphs: Int = getBaselineScoreForSiblings(topNode, language) + val baselineScoreForSiblingParagraphs: Int = getBaselineScoreForSiblings(topNode, lang) val results = walkSiblings(topNode) { currentNode => { - getSiblingContent(currentNode, baselineScoreForSiblingParagraphs) +// getSiblingContent(currentNode, baselineScoreForSiblingParagraphs, language) + getSiblingContent(currentNode, baselineScoreForSiblingParagraphs, lang) } }.reverse.flatMap(itm => itm) @@ -599,15 +764,17 
@@ trait ContentExtractor { * @param topNode * @return */ - private def getBaselineScoreForSiblings(topNode: Element): Int = { +// private def getBaselineScoreForSiblings(topNode: Element, language: Language): Int = { + private def getBaselineScoreForSiblings(topNode: Element, lang: String): Int = { var base: Int = 100000 var numberOfParagraphs: Int = 0 var scoreOfParagraphs: Int = 0 - val nodesToCheck: Elements = topNode.getElementsByTag("p") + val nodesToCheck: Elements = getChildParagraphs(topNode) for (node <- nodesToCheck) { val nodeText: String = node.text - val wordStats: WordStats = StopWords.getStopWordCount(nodeText) +// val wordStats: WordStats = StopWords.getStopWordCount(nodeText, language) + val wordStats: WordStats = StopWords.getStopWordCount(nodeText, lang) val highLinkDensity: Boolean = isHighLinkDensity(node) if (wordStats.getStopWordCount > 2 && !highLinkDensity) { numberOfParagraphs += 1; @@ -625,7 +792,9 @@ trait ContentExtractor { private def debugNode(e: Element): String = { val sb: StringBuilder = new StringBuilder - sb.append("GravityScore: '") + sb.append("'tag '") + sb.append(e.tagName) + sb.append("' GravityScore: '") sb.append(e.attr("gravityScore")) sb.append("' paraNodeCount: '") sb.append(e.attr("gravityNodes")) @@ -633,6 +802,7 @@ trait ContentExtractor { sb.append(e.id) sb.append("' className: '") sb.append(e.attr("class")) + sb.append("'") sb.toString() } -} \ No newline at end of file +} diff --git a/src/main/scala/com/gravity/goose/extractors/OpenGraphDataExtractor.scala b/src/main/scala/com/gravity/goose/extractors/OpenGraphDataExtractor.scala new file mode 100644 index 000000000..616e50551 --- /dev/null +++ b/src/main/scala/com/gravity/goose/extractors/OpenGraphDataExtractor.scala @@ -0,0 +1,51 @@ +/** +Copyright [2014] Robby Pond + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.gravity.goose.extractors + +import org.jsoup.nodes.Element + +import scala.collection.JavaConversions._ +import com.gravity.goose.opengraph.OpenGraphData +import org.joda.time.format.ISODateTimeFormat + +class OpenGraphDataExtractor extends Extractor[OpenGraphData] { + + def extract(rootElement: Element): OpenGraphData = { + val openGraphData: OpenGraphData = new OpenGraphData + val dateParser = ISODateTimeFormat.dateTimeParser + for(el <- rootElement.select("meta")) { + val property = el.attr("property") + val value = el.attr("content") + property match { + case "og:title" => openGraphData.title = value + case "og:site_name" => openGraphData.siteName = value + case "og:url" => openGraphData.url = value + case "og:description" => openGraphData.description = value + case "og:image" => openGraphData.image = value + case "og:type" => openGraphData.ogType = value + case "og:locale" => openGraphData.locale = value + case "article:author" => openGraphData.author = value + case "article:publisher" => openGraphData.publisher = value + case "article:section" => openGraphData.section = value + case "article:tag" => openGraphData.tags ++= value.split(",").map(_.trim) + case "article:published_time" => openGraphData.publishedTime = dateParser.parseDateTime(value) + case "article:modified_time" => openGraphData.modifiedTime = dateParser.parseDateTime(value) + case _ => () + } + } + openGraphData + } +} diff --git a/src/main/scala/com/gravity/goose/extractors/PublishDateExtractor.scala b/src/main/scala/com/gravity/goose/extractors/PublishDateExtractor.scala 
index aa8e1ae04..1ac535724 100644 --- a/src/main/scala/com/gravity/goose/extractors/PublishDateExtractor.scala +++ b/src/main/scala/com/gravity/goose/extractors/PublishDateExtractor.scala @@ -18,10 +18,10 @@ package com.gravity.goose.extractors import org.jsoup.nodes.Element -import java.util.Date +import com.github.nscala_time.time.Imports._ /** -* Implement this class to extract the {@link Date} of when this article was published. +* Implement this class to extract the {@link DateTime} of when this article was published. */ /** * Created by IntelliJ IDEA. @@ -29,15 +29,15 @@ import java.util.Date * Date: 5/19/11 * Time: 2:50 PM */ -abstract class PublishDateExtractor extends Extractor[Date] { +abstract class PublishDateExtractor extends Extractor[DateTime] { /** - * Intended to search the DOM and identify the {@link Date} of when this article was published. - *

This will be called by the {@link com.jimplush.goose.ContentExtractor#extractContent(String)} method and will be passed to {@link com.jimplush.goose.Article#setPublishDate(java.util.Date)}

+ * Intended to search the DOM and identify the {@link DateTime} of when this article was published. + *

This will be called by the {@link com.jimplush.goose.ContentExtractor#extractContent(String)} method and will be passed to {@link com.jimplush.goose.Article#setPublishDate(org.joda.time.DateTime)}

* * @param rootElement passed in from the {@link com.jimplush.goose.ContentExtractor} after the article has been parsed - * @return {@link Date} of when this particular article was published or null if no date could be found. + * @return {@link DateTime} of when this particular article was published or null if no date could be found. */ - def extract(rootElement: Element): Date + def extract(rootElement: Element): DateTime } diff --git a/src/main/scala/com/gravity/goose/images/Image.scala b/src/main/scala/com/gravity/goose/images/Image.scala index 0b2d425e5..4e734f325 100644 --- a/src/main/scala/com/gravity/goose/images/Image.scala +++ b/src/main/scala/com/gravity/goose/images/Image.scala @@ -36,12 +36,15 @@ class Image { /** * holds the src of the image */ - var imageSrc: String = ""; + var imageSrc: String = "" + + /** Raw score of image. */ + var imageScore: Float = 0 /** * how confident are we in this image extraction? the most images generally the less confident */ - var confidenceScore: Double = 0.0; + var confidenceScore: Double = 0.0 /** * Height of the image in pixels @@ -56,16 +59,28 @@ class Image { /** * what kind of image extraction was used for this? bestGuess, linkTag, openGraph tags? */ - var imageExtractionType: String = "NA"; + var imageExtractionType: String = "NA" /** * stores how many bytes this image is. 
*/ - var bytes: Long = 0; + var bytes: Long = 0 def getImageSrc = { imageSrc } + + + + override def toString = + s"""Image{ + imageSrc=$imageSrc, + imageScore=$imageScore, + height=$height, + width=$width, + imageExtractionType=$imageExtractionType +}""" + } \ No newline at end of file diff --git a/src/main/scala/com/gravity/goose/images/ImageExtractor.scala b/src/main/scala/com/gravity/goose/images/ImageExtractor.scala index c73c8a4e0..f45cfca0c 100644 --- a/src/main/scala/com/gravity/goose/images/ImageExtractor.scala +++ b/src/main/scala/com/gravity/goose/images/ImageExtractor.scala @@ -33,6 +33,8 @@ trait ImageExtractor extends CanLog { def getBestImage(doc: Document, topNode: Element): Image + def getAllImages(topNode: Element, parentDepthLevel: Int = 0, siblingDepthLevel: Int = 0): List[Image] + def logPrefix: String = ImageExtractor.loggingPrefix def critical(msg: String, refs: Any*) { diff --git a/src/main/scala/com/gravity/goose/images/ImageUtils.scala b/src/main/scala/com/gravity/goose/images/ImageUtils.scala index f4099b9e0..cb76266bf 100644 --- a/src/main/scala/com/gravity/goose/images/ImageUtils.scala +++ b/src/main/scala/com/gravity/goose/images/ImageUtils.scala @@ -23,11 +23,13 @@ package com.gravity.goose.images * Date: 8/18/11 */ +import javax.activation.MimetypesFileTypeMap import javax.imageio.ImageIO import java.awt.color.CMMException import java.awt.image.BufferedImage import com.gravity.goose.utils.{URLHelper, Logging} import org.apache.http.client.HttpClient +import org.apache.http.params.HttpConnectionParams import org.apache.http.HttpEntity import org.apache.http.protocol.{BasicHttpContext, HttpContext} import org.apache.http.client.protocol.ClientContext @@ -48,11 +50,11 @@ object ImageUtils extends Logging { * User: Jim Plush * gets the image dimensions for an image file, pass in the path to the image who's dimensions you want to get * this will use imageMagick since the Java IO and imaging shit SUCKS for getting mime types and file info 
for jpg and png files - * + * raisercostin: this one uses the executable imageMagick. In 2014 let's give again a chance to the java one :D * @param filePath * @return */ - def getImageDimensions(identifyProgram: String, filePath: String): ImageDetails = { + def getImageDimensions2(identifyProgram: String, filePath: String): ImageDetails = { val imageInfo = execToString(Array(identifyProgram, filePath)) val imageDetails: ImageDetails = new ImageDetails if (imageInfo == null || imageInfo.contains("no decode delegate for this image format")) { @@ -79,6 +81,25 @@ object ImageUtils extends Logging { imageDetails.setHeight(height) imageDetails } + /** + * User: Jim Plush + * gets the image dimensions for an image file, pass in the path to the image who's dimensions you want to get + * this will use imageMagick since the Java IO and imaging shit SUCKS for getting mime types and file info for jpg and png files + * + * @param filePath + * @return + */ + def getImageDimensions(identifyProgram: String, filePath: String): ImageDetails = { + + val (mimeType, width, height) = getImageDimensionsJava(filePath) + val imageDetails: ImageDetails = new ImageDetails + + imageDetails.setMimeType(mimeType) + imageDetails.setWidth(width) + imageDetails.setHeight(height) + + imageDetails + } /** * gets the image dimensions for an image file, pass in the path to the image who's dimensions you want to get, uses the built in java commands @@ -86,15 +107,13 @@ object ImageUtils extends Logging { * @param filePath * @return */ - def getImageDimensionsJava(filePath: String): HashMap[String, Integer] = { + def getImageDimensionsJava(filePath: String): (String, Integer,Integer) = { var image: BufferedImage = null try { val f: File = new File(filePath) image = ImageIO.read(f) - val results: HashMap[String, Integer] = new HashMap[String, Integer] - results.put("height", image.getHeight) - results.put("width", image.getWidth) - results + val mimeType : String = new 
MimetypesFileTypeMap().getContentType(f) + (mimeType, image.getWidth, image.getHeight) } catch { case e: CMMException => { @@ -205,7 +224,7 @@ object ImageUtils extends Logging { case "png" => ".png" case "jpg" => ".jpg" case "jpeg" => ".jpg" - case ".gif" => ".gif" + case "gif" => ".gif" case _ => "NA" } mimeType @@ -274,7 +293,13 @@ object ImageUtils extends Logging { val localContext: HttpContext = new BasicHttpContext localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore) val response = try { - config.getHtmlFetcher.getHttpClient.execute(httpget, localContext) + val httpClient = config.getHtmlFetcher.getHttpClient // this doesn't use the passed in httpClient, I'm not sure why... + val params = httpClient.getParams + + HttpConnectionParams.setConnectionTimeout(params, config.getImageConnectionTimeout()) + HttpConnectionParams.setSoTimeout(params, config.getImageSocketTimeout()) + + httpClient.execute(httpget, localContext) } catch { case ex: Exception => throw new ImageFetchException(imageSrc, ex) diff --git a/src/main/scala/com/gravity/goose/images/StandardImageExtractor.scala b/src/main/scala/com/gravity/goose/images/StandardImageExtractor.scala index d19eeef58..bc486e01b 100644 --- a/src/main/scala/com/gravity/goose/images/StandardImageExtractor.scala +++ b/src/main/scala/com/gravity/goose/images/StandardImageExtractor.scala @@ -40,7 +40,6 @@ import org.apache.http.client.methods.HttpGet * Date: 8/18/11 */ -case class DepthTraversal(node: Element, parentDepth: Int, siblingDepth: Int) /** * This image extractor will attempt to find the best image nearest the article. 
@@ -77,7 +76,7 @@ class StandardImageExtractor(httpClient: HttpClient, article: Article, config: C var sb: StringBuilder = new StringBuilder // create negative elements - sb.append(".html|.gif|.ico|button|twitter.jpg|facebook.jpg|ap_buy_photo|digg.jpg|digg.png|delicious.png|facebook.png|reddit.jpg|doubleclick|diggthis|diggThis|adserver|/ads/|ec.atdmt.com") + sb.append(".html|.ico|button|twitter.jpg|facebook.jpg|ap_buy_photo|digg.jpg|digg.png|delicious.png|facebook.png|reddit.jpg|doubleclick|diggthis|diggThis|adserver|/ads/|ec.atdmt.com") sb.append("|mediaplex.com|adsatt|view.atdmt") matchBadImageNames = Pattern.compile(sb.toString()).matcher(string.empty) @@ -112,6 +111,8 @@ class StandardImageExtractor(httpClient: HttpClient, article: Article, config: C image } + def getAllImages(topNode: Element, parentDepthLevel: Int = 0, siblingDepthLevel: Int = 0): List[Image] = List[Image]() + private def checkForMetaTag: Boolean = { if (this.checkForLinkTag) { return true @@ -666,4 +667,4 @@ class StandardImageExtractor(httpClient: HttpClient, article: Article, config: C } -} \ No newline at end of file +} diff --git a/src/main/scala/com/gravity/goose/images/UpgradedImageIExtractor.scala b/src/main/scala/com/gravity/goose/images/UpgradedImageIExtractor.scala index 787e9c0c1..aedcf6f63 100644 --- a/src/main/scala/com/gravity/goose/images/UpgradedImageIExtractor.scala +++ b/src/main/scala/com/gravity/goose/images/UpgradedImageIExtractor.scala @@ -1,55 +1,57 @@ package com.gravity.goose.images import org.apache.http.client.HttpClient -import com.gravity.goose.{Configuration, Article} -import org.jsoup.nodes.{Element, Document} -import java.util.regex.{Pattern, Matcher} +import com.gravity.goose.{ Configuration, Article } +import org.jsoup.nodes.{ Element, Document } +import java.util.regex.{ Pattern, Matcher } import com.gravity.goose.text.string -import java.net.{MalformedURLException, URL} +import java.net.{ MalformedURLException, URL } import org.jsoup.select.Elements import 
scala.collection.JavaConversions._ import java.util.ArrayList -import collection.mutable.{ListBuffer, HashMap} +import collection.mutable.{ ListBuffer, HashMap } import com.gravity.goose.utils.FileHelper import io.Source /** -* Created by Jim Plush -* User: jim -* Date: 9/22/11 -*/ + * Created by Jim Plush + * User: jim + * Date: 9/22/11 + */ + +case class DepthTraversal(node: Element, parentDepth: Int, siblingDepth: Int) class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: Configuration) extends ImageExtractor { import UpgradedImageIExtractor._ /** - * What's the minimum bytes for an image we'd accept is - */ + * What's the minimum bytes for an image we'd accept is + */ private val minBytesForImages: Int = 4000 /** - * the webpage url that we're extracting content from - */ + * the webpage url that we're extracting content from + */ val targetUrl = article.finalUrl /** - * stores a hash of our url for reference and image processing - */ + * stores a hash of our url for reference and image processing + */ val linkhash = article.linkhash /** - * this lists all the known bad button names that we have - */ + * this lists all the known bad button names that we have + */ val matchBadImageNames: Matcher = { val sb = new StringBuilder // create negative elements - sb.append(".html|.gif|.ico|button|twitter.jpg|facebook.jpg|ap_buy_photo|digg.jpg|digg.png|delicious.png|facebook.png|reddit.jpg|doubleclick|diggthis|diggThis|adserver|/ads/|ec.atdmt.com") + sb.append(".html|.gif|.ico|button|twitter.jpg|facebook.jpg|ap_buy_photo|digg.jpg|digg.png|delicious.png|facebook.png|reddit.jpg|doubleclick|diggthis|diggThis|adserver|/ads/|ec.atdmt.com") sb.append("|mediaplex.com|adsatt|view.atdmt") Pattern.compile(sb.toString()).matcher(string.empty) } def getBestImage(doc: Document, topNode: Element): Image = { - trace("Starting to Look for the Most Relavent Image") + trace("Starting to Look for the Most Relevant Image") checkForKnownElements() match { case 
Some(image) => return image case None => { @@ -57,6 +59,11 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } } + checkForMetaTag match { + case Some(image) => return image + case None => trace("No Meta Tag Images found") + } + checkForLargeImages(topNode, 0, 0) match { case Some(image) => return image case None => { @@ -64,40 +71,56 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } } - checkForMetaTag match { - case Some(image) => return image - case None => trace("No Meta Tag Images found") - } - new Image } + def getAllImages(node: Element): List[Image] = { + getImageCandidates(node) match { + case Some(goodImages) => { + val scoredImages = downloadImagesAndGetResults(goodImages, 0) + scoredImages.map((scoredImage: (LocallyStoredImage, Float)) => scoredImageToResultImage(scoredImage._1, scoredImages.size)).toList + } + case None => { + Nil + } + } + } + + /** + * Prefer Twitter images (as they tend to have the right size for us), then Open Graph images + * (which seem to be smaller), and finally linked images. + */ private def checkForMetaTag: Option[Image] = { - checkForLinkTag match { + + checkForTwitterTag match { case Some(image) => return Some(image) - case None => trace("No known images found") + case None => trace("No twitter image found") } checkForOpenGraphTag match { case Some(image) => return Some(image) - case None => trace("No known images found") + case None => trace("No open graph images found") + } + + checkForLinkTag match { + case Some(image) => return Some(image) + case None => trace("No link tag images found") } None } - /** - * although slow the best way to determine the best image is to download them and check the actual dimensions of the image when on disk - * so we'll go through a phased approach... - * 1. get a list of ALL images from the parent node - * 2. filter out any bad image names that we know of (gifs, ads, etc..) - * 3. 
do a head request on each file to make sure it meets our bare requirements - * 4. any images left over let's do a full GET request, download em to disk and check their dimensions - * 5. Score images based on different factors like height/width and possibly things like color density - * - * @param node - */ + * although slow the best way to determine the best image is to download them and check the actual dimensions of the image when on disk + * so we'll go through a phased approach... + * 1. get a list of ALL images from the parent node + * 2. filter out any bad image names that we know of (gifs, ads, etc..) + * 3. do a head request on each file to make sure it meets our bare requirements + * 4. any images left over let's do a full GET request, download em to disk and check their dimensions + * 5. Score images based on different factors like height/width and possibly things like color density + * + * @param node + */ private def checkForLargeImages(node: Element, parentDepthLevel: Int, siblingDepthLevel: Int): Option[Image] = { trace("Checking for large images - parent depth " + parentDepthLevel + " sibling depth: " + siblingDepthLevel) @@ -108,12 +131,7 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: // get the high score image in a tuple scoredImages.sortBy(-_._2).take(1).headOption match { case Some(highScoreImage) => { - val mainImage = new Image - // mainImage.topImageNode = highScoreImage - mainImage.imageSrc = highScoreImage._1.imgSrc - mainImage.imageExtractionType = "bigimage" - mainImage.bytes = highScoreImage._1.bytes - mainImage.confidenceScore = if (scoredImages.size > 0) (100 / scoredImages.size) else 0 + val mainImage = scoredImageToResultImage(highScoreImage._1, scoredImages.size) trace("IMAGE COMPLETE: High Score Image is: " + mainImage.imageSrc + " Score is: " + highScoreImage._2) return Some(mainImage) } @@ -128,7 +146,6 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } } - } 
case None => { getDepthLevel(node, parentDepthLevel, siblingDepthLevel) match { @@ -143,6 +160,18 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: None } + private def scoredImageToResultImage(scoredImage: LocallyStoredImage, scoredImagesLength: Int): Image = { + val mainImage = new Image + // mainImage.topImageNode = highScoreImage + mainImage.imageSrc = scoredImage.imgSrc + mainImage.imageExtractionType = "bigimage" + mainImage.bytes = scoredImage.bytes + mainImage.width = scoredImage.width + mainImage.height = scoredImage.height + mainImage.confidenceScore = if (scoredImagesLength > 0) (100 / scoredImagesLength) else 0 + mainImage + } + def getDepthLevel(node: Element, parentDepth: Int, siblingDepth: Int): Option[DepthTraversal] = { if (node == null) return None @@ -161,16 +190,16 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } /** - * download the images to temp disk and set their dimensions - *

- * we're going to score the images in the order in which they appear so images higher up will have more importance, - * we'll count the area of the 1st image as a score of 1 and then calculate how much larger or small each image after it is - * we'll also make sure to try and weed out banner type ad blocks that have big widths and small heights or vice versa - * so if the image is 3rd found in the dom it's sequence score would be 1 / 3 = .33 * diff in area from the first image - * - * @param images - * @return - */ + * download the images to temp disk and set their dimensions + *

+ * we're going to score the images in the order in which they appear so images higher up will have more importance, + * we'll count the area of the 1st image as a score of 1 and then calculate how much larger or small each image after it is + * we'll also make sure to try and weed out banner type ad blocks that have big widths and small heights or vice versa + * so if the image is 3rd found in the dom it's sequence score would be 1 / 3 = .33 * diff in area from the first image + * + * @param images + * @return + */ private def downloadImagesAndGetResults(images: ArrayList[Element], depthLevel: Int): ListBuffer[(LocallyStoredImage, Float)] = { val imageResults = new ListBuffer[(LocallyStoredImage, Float)]() var initialArea: Float = 0 @@ -186,8 +215,9 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: height = locallyStoredImage.height if (height > MIN_HEIGHT) fileExtension = locallyStoredImage.fileExtension - if (fileExtension != ".gif" && fileExtension != "NA") - imageSrc = locallyStoredImage.imgSrc + //why not gif?: if (fileExtension != ".gif" && fileExtension != "NA") + if (fileExtension != "NA") + imageSrc = locallyStoredImage.imgSrc if ((depthLevel >= 1 && locallyStoredImage.width > 300) || depthLevel < 1) if (!isBannerDimensions(width, height)) } { @@ -198,8 +228,7 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: // give the initial image a little area boost as well initialArea = area * 1.48f totalScore = 1 - } - else { + } else { val areaDifference: Float = area / initialArea totalScore = sequenceScore * areaDifference } @@ -211,22 +240,45 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } }) - - imageResults } - def getAllImages: ArrayList[Element] = { - null + def getAllImages(topNode: Element, parentDepthLevel: Int = 0, siblingDepthLevel: Int = 0): List[Image] = { + trace("getting All Images") + var images: ListBuffer[Image] = new ListBuffer() + 
getImageCandidates(topNode) match { + case Some(candidateImages) => { + for { + cadidateImg <- candidateImages + locallyStoredImg <- getLocallyStoredImage(buildImagePath(cadidateImg.attr("src"))) + } { + var img = new Image + img.imageSrc = locallyStoredImg.imgSrc + img.width = locallyStoredImg.width + img.height = locallyStoredImg.height + img.bytes = locallyStoredImg.bytes + images += img + } + return images.toList + } + case None => { + getDepthLevel(topNode, parentDepthLevel, siblingDepthLevel) match { + case Some(depthObj) => { + return getAllImages(depthObj.node, depthObj.parentDepth, depthObj.siblingDepth) + } + case None => return images.toList + } + } + } } /** - * returns true if we think this is kind of a bannery dimension - * like 600 / 100 = 6 may be a fishy dimension for a good image - * - * @param width - * @param height - */ + * returns true if we think this is kind of a bannery dimension + * like 600 / 100 = 6 may be a fishy dimension for a good image + * + * @param width + * @param height + */ private def isBannerDimensions(width: Int, height: Int): Boolean = { if (width == height) { return false @@ -257,18 +309,17 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } /** - * takes a list of image elements and filters out the ones with bad names - * - * @param images - * @return - */ + * takes a list of image elements and filters out the ones with bad names + * + * @param images + * @return + */ private def filterBadNames(images: Elements): Option[ArrayList[Element]] = { val goodImages: ArrayList[Element] = new ArrayList[Element] for (image <- images) { if (this.isOkImageFileName(image)) { goodImages.add(image) - } - else { + } else { image.remove() } } @@ -276,10 +327,10 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } /** - * will check the image src against a list of bad image files we know of like buttons, etc... 
- * - * @return - */ + * will check the image src against a list of bad image files we know of like buttons, etc... + * + * @return + */ private def isOkImageFileName(imageNode: Element): Boolean = { val imgSrc: String = imageNode.attr("src") if (string.isNullOrEmpty(imgSrc)) { @@ -303,18 +354,19 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: filteredImages <- filterBadNames(images) goodImages <- findImagesThatPassByteSizeTest(filteredImages) } { - return Some(filteredImages) + //return Some(filteredImages) + return Some(goodImages) } None } /** - * loop through all the images and find the ones that have the best bytez to even make them a candidate - * - * @param images - * @return - */ + * loop through all the images and find the ones that have the best bytez to even make them a candidate + * + * @param images + * @return + */ private def findImagesThatPassByteSizeTest(images: ArrayList[Element]): Option[ArrayList[Element]] = { var cnt: Int = 0 val MAX_BYTES_SIZE: Int = 15728640 @@ -348,7 +400,6 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: cnt += 1 }) - trace(" Now leaving findImagesThatPassByteSizeTest") if (goodImages == null || goodImages.isEmpty) None else Some(goodImages) @@ -359,10 +410,10 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } /** - * checks to see if we were able to find open graph tags on this page - * - * @return - */ + * checks to see if we were able to find open graph tags on this page + * + * @return + */ private def checkForLinkTag: Option[Image] = { if (article.rawDoc == null) return None @@ -388,11 +439,10 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } trace("link tag found, using it") - return Some(mainImage) + return ensureMinimumImageSize(mainImage) } None - } - catch { + } catch { case e: Exception => { warn("Unexpected exception caught in checkForLinkTag. 
Handled by returning None.", e) None @@ -402,10 +452,10 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } /** - * checks to see if we were able to find open graph tags on this page - * - * @return - */ + * checks to see if we were able to find open graph tags on this page + * + * @return + */ private def checkForOpenGraphTag: Option[Image] = { try { val meta: Elements = article.rawDoc.select("meta[property~=og:image]") @@ -429,11 +479,45 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } trace("open graph tag found, using it: %s".format(imagePath)) - return Some(mainImage) + return ensureMinimumImageSize(mainImage) } None + } catch { + case e: Exception => { + warn(e, e.toString) + None + } } - catch { + } + + private def checkForTwitterTag: Option[Image] = { + try { + val meta: Elements = article.rawDoc.select("meta[property~=twitter:image]") + + for (item <- meta) { + if (item.attr("content").length < 1) { + return None + } + val imagePath: String = this.buildImagePath(item.attr("content")) + val mainImage = new Image + mainImage.imageSrc = imagePath + mainImage.imageExtractionType = "twitter" + mainImage.confidenceScore = 100 + getLocallyStoredImage(mainImage.imageSrc) match { + case Some(locallyStoredImage) => { + mainImage.bytes = locallyStoredImage.bytes + mainImage.height = locallyStoredImage.height + mainImage.width = locallyStoredImage.width + } + case None => + } + trace("twitter image tag found, using it: %s".format(imagePath)) + + return ensureMinimumImageSize(mainImage) + } + None + + } catch { case e: Exception => { warn(e, e.toString) None @@ -441,23 +525,31 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: } } + private def ensureMinimumImageSize(mainImage: Image): Option[Image] = { + trace("Checking image %s for proper size".format(mainImage.getImageSrc)) + if (mainImage.width >= config.minWidth && mainImage.height >= config.minHeight) { + 
trace("Image accepted") + return Some(mainImage) + } + trace("Image rejected as too small - actual size is %1$s wide by %2$s tall".format(mainImage.width, mainImage.height)) + None + } /** - * returns the bytes of the image file on disk - */ + * returns the bytes of the image file on disk + */ def getLocallyStoredImage(imageSrc: String): Option[LocallyStoredImage] = ImageUtils.storeImageToLocalFile(httpClient, linkhash, imageSrc, config) - def getCleanDomain = { // just grab the very end of the domain dotRegex.split(article.domain).takeRight(2).mkString(".") } /** - * in here we check for known image contains from sites we've checked out like yahoo, techcrunch, etc... that have - * known places to look for good images. - * //todo enable this to use a series of settings files so people can define what the image ids/classes are on specific sites - */ + * in here we check for known image contains from sites we've checked out like yahoo, techcrunch, etc... that have + * known places to look for good images. 
+ * //todo enable this to use a series of settings files so people can define what the image ids/classes are on specific sites + */ def checkForKnownElements(): Option[Image] = { if (article.rawDoc == null) return None @@ -497,23 +589,22 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: mainImage.width = locallyStoredImage.width }) - Some(mainImage) + ensureMinimumImageSize(mainImage) } /** - * This method will take an image path and build out the absolute path to that image - * using the initial url we crawled so we can find a link to the image if they use relative urls like ../myimage.jpg - * - * @param imageSrc - * @return - */ + * This method will take an image path and build out the absolute path to that image + * using the initial url we crawled so we can find a link to the image if they use relative urls like ../myimage.jpg + * + * @param imageSrc + * @return + */ private def buildImagePath(imageSrc: String): String = { try { val pageURL = new URL(this.targetUrl) return new URL(pageURL, ImageUtils.cleanImageSrcString(imageSrc)).toString - } - catch { + } catch { case e: MalformedURLException => { warn("Unable to get Image Path: " + imageSrc) } @@ -522,7 +613,6 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config: imageSrc } - } object UpgradedImageIExtractor { @@ -542,4 +632,4 @@ object UpgradedImageIExtractor { val KNOWN_IMG_DOM_NAMES = ListBuffer("yn-story-related-media", "cnn_strylccimg300cntr", "big_photo", "ap-smallphoto-a") -} \ No newline at end of file +} diff --git a/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala b/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala index 34ebf44a7..18b6aca59 100644 --- a/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala +++ b/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala @@ -18,38 +18,49 @@ package com.gravity.goose.network +import org.apache.http.Header +import org.apache.http.HeaderElement import 
org.apache.http.HttpEntity +import org.apache.http.HttpHost import org.apache.http.HttpResponse import org.apache.http.HttpVersion +import org.apache.http.{HttpRequest, HttpRequestInterceptor, HttpResponse, HttpResponseInterceptor, HeaderElementIterator} +import org.apache.http.client.entity.GzipDecompressingEntity import org.apache.http.client.CookieStore +import org.apache.http.impl.client.BasicCookieStore +import org.apache.http.client.entity.GzipDecompressingEntity import org.apache.http.client.HttpClient import org.apache.http.client.methods.HttpGet import org.apache.http.client.params.CookiePolicy import org.apache.http.client.protocol.ClientContext +import org.apache.http.conn.params.ConnRoutePNames +import org.apache.http.conn.ConnectionKeepAliveStrategy import org.apache.http.conn.scheme.PlainSocketFactory import org.apache.http.conn.ssl.SSLSocketFactory -import org.apache.http.conn.scheme.Scheme -import org.apache.http.conn.scheme.SchemeRegistry +import org.apache.http.conn.scheme.{Scheme, SchemeRegistry} import org.apache.http.cookie.Cookie -import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager -import org.apache.http.params.BasicHttpParams -import org.apache.http.params.HttpConnectionParams -import org.apache.http.params.HttpParams -import org.apache.http.params.HttpProtocolParams -import org.apache.http.protocol.BasicHttpContext -import org.apache.http.protocol.HttpContext -import org.apache.http.util.EntityUtils +import org.apache.http.impl.conn.PoolingClientConnectionManager +import org.apache.http.message.BasicHeaderElementIterator +import org.apache.http.params.{HttpParams, BasicHttpParams, HttpConnectionParams, HttpProtocolParams} +import org.apache.http.protocol.{HTTP, BasicHttpContext, HttpContext} +import org.apache.http.entity.ContentType import java.io._ import java.net.SocketException import java.net.SocketTimeoutException +import java.net.URL import java.net.URLConnection import java.util.ArrayList -import java.util.Date import 
java.util.List +import java.util.Date import com.gravity.goose.utils.Logging import com.gravity.goose.Configuration import org.apache.http.impl.client.{DefaultHttpRequestRetryHandler, AbstractHttpClient, DefaultHttpClient} - +import org.apache.commons.io.IOUtils +import com.ibm.icu.text.CharsetDetector +import org.apache.http.util.EntityUtils +import org.apache.http.conn.ClientConnectionManager +import com.gravity.goose.network.gae.GAEConnectionManager +import org.apache.http.conn.HttpClientConnectionManager /** * User: Jim Plush @@ -66,13 +77,13 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { * cookies for head requests, only slows shit down */ var emptyCookieStore: CookieStore = null + /** * holds the HttpClient object for making requests */ private var httpClient: HttpClient = null initClient() - def getHttpClient: HttpClient = { httpClient } @@ -94,7 +105,8 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { var htmlResult: String = null var entity: HttpEntity = null var instream: InputStream = null - + var contentType: ContentType = null + // Identified the the apache http client does not drop URL fragments before opening the request to the host // more info: http://stackoverflow.com/questions/4251841/400-error-with-httpclient-for-a-link-with-an-anchor val cleanUrl = { @@ -104,15 +116,17 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { try { val localContext: HttpContext = new BasicHttpContext - localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore) + localContext.setAttribute(ClientContext.COOKIE_STORE, new BasicCookieStore) httpget = new HttpGet(cleanUrl) - HttpProtocolParams.setUserAgent(httpClient.getParams, config.getBrowserUserAgent()); + httpget.setHeader("referer", config.getBrowserReferer()) val params = httpClient.getParams + HttpProtocolParams.setUserAgent(params, config.getBrowserUserAgent()) + trace("Setting UserAgent To: " + 
HttpProtocolParams.getUserAgent(httpClient.getParams)) + HttpConnectionParams.setConnectionTimeout(params, config.getConnectionTimeout()) HttpConnectionParams.setSoTimeout(params, config.getSocketTimeout()) - trace("Setting UserAgent To: " + HttpProtocolParams.getUserAgent(httpClient.getParams)) val response: HttpResponse = httpClient.execute(httpget, localContext) HttpStatusValidator.validate(cleanUrl, response.getStatusLine.getStatusCode) match { @@ -121,15 +135,34 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { } entity = response.getEntity + // via http://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org/apache/http/examples/client/ClientGZipContentCompression.java if (entity != null) { - instream = entity.getContent - var encodingType: String = "UTF-8" try { - encodingType = EntityUtils.getContentCharSet(entity) - if (encodingType == null) { - encodingType = "UTF-8" + val ceheader: Header = entity.getContentEncoding(); + if (ceheader != null) { + val codecs: Array[HeaderElement] = ceheader.getElements(); + for(i <- 0 until codecs.length) { + if (codecs(i).getName().equalsIgnoreCase("gzip")) { + entity = new GzipDecompressingEntity(response.getEntity()) + } + } + } + } catch { + case e: Exception => { + trace("Unable to get header elements: " + cleanUrl) } } + } + + if (entity != null) { + instream = entity.getContent + val encodingType: String = config.resolveCharSet(url, entity) + + try { + contentType = ContentType.get(entity) + trace("Got contentType: " + contentType) + + } catch { case e: Exception => { if (logger.isDebugEnabled) { @@ -139,7 +172,7 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { } } try { - htmlResult = HtmlFetcher.convertStreamToString(instream, 15728640, encodingType).trim + htmlResult = HtmlFetcher.convertStreamToString(instream, encodingType).trim } finally { EntityUtils.consume(entity) @@ -162,14 +195,15 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { } case e: 
SocketTimeoutException => { trace(e.toString) + throw new GatewayTimeoutException(e.toString + " " + e.getMessage) } case e: LoggableException => { logger.warn(e.getMessage) - return None + throw e } case e: Exception => { - trace("FAILURE FOR LINK: " + cleanUrl + " " + e.toString) - return None + warn("FAILURE FOR LINK: " + cleanUrl + " " + e.toString) + throw e } } finally { @@ -186,6 +220,7 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { if (httpget != null) { try { httpget.abort() + httpget.releaseConnection() entity = null } catch { @@ -208,8 +243,13 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { try { is = new ByteArrayInputStream(htmlResult.getBytes("UTF-8")) mimeType = URLConnection.guessContentTypeFromStream(is) - if (mimeType != null) { - if ((mimeType == "text/html") == true || (mimeType == "application/xml") == true) { + if (mimeType != null || contentType != null) { + if(mimeType == null) { + mimeType = contentType.getMimeType() + trace("no guessed mimetype? 
using contentType: " + mimeType + " - " + cleanUrl) + } + + if ((mimeType == "text/html") || (mimeType == "application/xml") || (mimeType == "application/xhtml+xml") || (mimeType == "text/xml") ) { return Some(htmlResult) } else { @@ -219,8 +259,11 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { trace("GRVBIGFAIL: " + mimeType + " - " + cleanUrl) throw new NotHtmlException(cleanUrl) } - } + + } + else { + trace("no mimetype?: " + mimeType + " - " + cleanUrl) throw new NotHtmlException(cleanUrl) } } @@ -240,8 +283,8 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { trace("Initializing HttpClient") val httpParams: HttpParams = new BasicHttpParams - HttpConnectionParams.setConnectionTimeout(httpParams, 10 * 1000) - HttpConnectionParams.setSoTimeout(httpParams, 10 * 1000) + HttpConnectionParams.setConnectionTimeout(httpParams, 10 * 1000) // 10 seconds + HttpConnectionParams.setSoTimeout(httpParams, 10 * 1000) // 10 seconds HttpProtocolParams.setVersion(httpParams, HttpVersion.HTTP_1_1) emptyCookieStore = new CookieStore { def addCookie(cookie: Cookie) { @@ -260,24 +303,98 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { private[network] var emptyList: ArrayList[Cookie] = new ArrayList[Cookie] } + httpParams.setParameter("http.protocol.single-cookie-header", true) httpParams.setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY) httpParams.setParameter("http.User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8") httpParams.setParameter("http.language.Accept-Language", "en-us") httpParams.setParameter("http.protocol.content-charset", "UTF-8") httpParams.setParameter("Accept", "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5") httpParams.setParameter("Cache-Control", "max-age=0") - httpParams.setParameter("http.connection.stalecheck", false) - val schemeRegistry: SchemeRegistry = new 
SchemeRegistry - schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory)) - schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory)) - val cm = new ThreadSafeClientConnManager(schemeRegistry) - cm.setMaxTotal(20000) - cm.setDefaultMaxPerRoute(500) + httpParams.setParameter("http.connection.stalecheck", true) +//gae??? + val cm = createConnectionManager + httpClient = new DefaultHttpClient(cm, httpParams) httpClient.asInstanceOf[AbstractHttpClient].setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false)) - httpClient.getParams.setParameter("http.conn-manager.timeout", 120000L) - httpClient.getParams.setParameter("http.protocol.wait-for-continue", 10000L) + httpClient.getParams.setParameter("http.connection-manager.timeout", 20000L) // timeout for retrieving a connection from the pool + httpClient.getParams.setParameter("http.protocol.wait-for-continue", 5000L) // timeout for how long the client waits for 100-continue before sending request body httpClient.getParams.setParameter("http.tcp.nodelay", true) + + // First check proxy configured from java properties, otherwise use env var if set + if (scala.sys.props.isDefinedAt("http.proxyHost")) { + val host = scala.sys.props.getOrElse("http.proxyHost", "") + val port = scala.sys.props.getOrElse("http.proxyPort", "80").toInt + httpClient.getParams.setParameter(ConnRoutePNames.DEFAULT_PROXY, new HttpHost(host, port)) + } else if (sys.env.isDefinedAt("http_proxy")) { + val url = new URL(sys.env.getOrElse("http_proxy", "")) + val host = url.getHost + val port = url.getPort + httpClient.getParams.setParameter(ConnRoutePNames.DEFAULT_PROXY, new HttpHost(host, port)) + } + + // http://hc.apache.org/httpcomponents-client-ga/httpclient/examples/org/apache/http/examples/client/ClientGZipContentCompression.java + httpClient.asInstanceOf[AbstractHttpClient].addRequestInterceptor(new HttpRequestInterceptor() { + def process( request: HttpRequest, context: 
HttpContext) { + if (!request.containsHeader("Accept-Encoding")) { + request.addHeader("Accept-Encoding", "gzip") + } + + // First check proxy configured from java properties, otherwise use env var if set + if (scala.sys.props.isDefinedAt("http.proxyHost")) { + val host = scala.sys.props.getOrElse("http.proxyHost", "") + val port = scala.sys.props.getOrElse("http.proxyPort", "80").toInt + httpClient.getParams.setParameter(ConnRoutePNames.DEFAULT_PROXY, new HttpHost(host, port)) + } else if (sys.env.isDefinedAt("http_proxy")) { + val url = new URL(sys.env.getOrElse("http_proxy", "")) + val host = url.getHost + val port = url.getPort + httpClient.getParams.setParameter(ConnRoutePNames.DEFAULT_PROXY, new HttpHost(host, port)) + } + } + }) + + httpClient.asInstanceOf[AbstractHttpClient].addResponseInterceptor(new HttpResponseInterceptor() { + def process( response: HttpResponse, context: HttpContext) { + val entity: HttpEntity = response.getEntity() + if (entity != null) { + val ceheader: Header = entity.getContentEncoding() + if (ceheader != null) { + val codecs = ceheader.getElements() + for ( c <- codecs) { + if (c.getName().equalsIgnoreCase("gzip")) { + response.setEntity( + new GzipDecompressingEntity(response.getEntity())) + return + } + } + } + } + } + }) + + httpClient.asInstanceOf[AbstractHttpClient].setKeepAliveStrategy(new ConnectionKeepAliveStrategy() { + def getKeepAliveDuration( response: HttpResponse, context: HttpContext): Long = { + // Honor 'keep-alive' header + val it: HeaderElementIterator = new BasicHeaderElementIterator(response.headerIterator(HTTP.CONN_KEEP_ALIVE)) + + while (it.hasNext()) { + val he: HeaderElement = it.nextElement() + val param: String = he.getName() + val value: String = he.getValue() + if (value != null && param.equalsIgnoreCase("timeout")) { + try { + return value.toLong * 1000 + } catch { + case e: NumberFormatException => {} // ignore numberformat errors + } + } + } + + // otherwise keep alive for 10 seconds + return 10 * 
1000 + } + }) + } /** @@ -287,52 +404,52 @@ object HtmlFetcher extends AbstractHtmlFetcher with Logging { * @param maxBytes The max bytes that we want to read from the input stream * @return String */ - def convertStreamToString(is: InputStream, maxBytes: Int, encodingType: String): String = { - val buf: Array[Char] = new Array[Char](2048) - var r: Reader = null - val s = new StringBuilder + def convertStreamToString(is: InputStream, httpEncodingType: String): String = { try { - r = new InputStreamReader(is, encodingType) - var bytesRead: Int = 2048 - var inLoop = true - while (inLoop) { - if (bytesRead >= maxBytes) { - throw new MaxBytesException - } - var n: Int = r.read(buf) - bytesRead += 2048 + var buf : Array[Byte] = IOUtils.toByteArray(is) + return encodedText(buf) - if (n < 0) inLoop = false - if (inLoop) s.appendAll(buf, 0, n) - } - return s.toString() } + catch { case e: SocketTimeoutException => { logger.warn(e.toString + " " + e.getMessage) } case e: UnsupportedEncodingException => { - logger.warn(e.toString + " Encoding: " + encodingType) + logger.warn(e.toString + " " + e.getMessage) } case e: IOException => { logger.warn(e.toString + " " + e.getMessage) } } - finally { - if (r != null) { - try { - r.close() - } - catch { - case e: Exception => { - } - } - } - } null } + + def encodedText(buf : Array[Byte]) : String = { + val detector = new CharsetDetector() + detector.setText(buf) + val matched = detector.detect() + matched.getLanguage + matched.getString + } + def createConnectionManager:ClientConnectionManager = createDefaultConnectionManager + //enable gae connection manager + //def createConnectionManager:ClientConnectionManager = createGaeConnectionManager + + def createGaeConnectionManager = new GAEConnectionManager + def createDefaultConnectionManager:ClientConnectionManager = { + val schemeRegistry: SchemeRegistry = new SchemeRegistry + schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory)) + 
schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory)) + //gae?? + // val cm = new ThreadSafeClientConnManager(schemeRegistry) + val cm = new PoolingClientConnectionManager(schemeRegistry) + cm.setMaxTotal(4000) + cm.setDefaultMaxPerRoute(20) + cm + } } diff --git a/src/main/scala/com/gravity/goose/network/HttpExceptions.scala b/src/main/scala/com/gravity/goose/network/HttpExceptions.scala index 52a4ded54..444d46d30 100644 --- a/src/main/scala/com/gravity/goose/network/HttpExceptions.scala +++ b/src/main/scala/com/gravity/goose/network/HttpExceptions.scala @@ -18,10 +18,13 @@ class LoggableException(msg: String, innerEx: Exception = null) extends Exceptio } } +class NoArticleException(url: String) extends LoggableException("Couldn't find article for url: " + url) +class ArticleParseException(url: String) extends LoggableException("Couldn't parse article for url: " + url) class NotFoundException(url: String) extends LoggableException("SERVER RETURNED 404 FOR LINK: " + url) class BadRequestException(url: String) extends LoggableException("Bad Request for URL: " + url) class NotAuthorizedException(url: String, statusCode: Int = 403) extends LoggableException("Not authorized (statusCode: %d) to access URL: %s".format(statusCode, url)) class ServerErrorException(url: String, statusCode: Int = 500) extends LoggableException("Server Error! Status code returned: %d for URL: %s".format(statusCode, url)) +class GatewayTimeoutException(url: String) extends LoggableException("Server Error! 
Timeout reading URL: %s".format(url)) class UnhandledStatusCodeException(url: String, statusCode: Int) extends LoggableException("Received HTTP statusCode: %d from URL: %s and did not know how to handle it!".format(statusCode, url)) object HttpStatusValidator { diff --git a/src/main/scala/com/gravity/goose/network/gae/GAEClientConnection.scala b/src/main/scala/com/gravity/goose/network/gae/GAEClientConnection.scala new file mode 100644 index 000000000..4eb787462 --- /dev/null +++ b/src/main/scala/com/gravity/goose/network/gae/GAEClientConnection.scala @@ -0,0 +1,175 @@ +package com.gravity.goose.network + +import org.apache.http._ +import org.apache.http.conn.ClientConnectionManager +import org.apache.http.conn.ManagedClientConnection +import org.apache.http.conn.routing.HttpRoute +import org.apache.http.entity.ByteArrayEntity +import org.apache.http.message.BasicHttpResponse +import org.apache.http.params.HttpParams +import org.apache.http.protocol.HttpContext +import com.google.appengine.api.urlfetch._ +import java.io.ByteArrayOutputStream +import java.io.IOException +import java.net.InetAddress +import java.net.URI +import java.net.URISyntaxException +import java.util.concurrent.TimeUnit +import GAEClientConnection._ +import scala.beans._ +//import scala.reflect.{ BeanProperty, BooleanBeanProperty } +import scala.collection.JavaConversions._ +import org.apache.http.conn.HttpClientConnectionManager +import java.net.Socket + +object GAEClientConnection { + + private var urlFS: URLFetchService = URLFetchServiceFactory.getURLFetchService +} + +class GAEClientConnection(cm: ClientConnectionManager, @BeanProperty var route: HttpRoute, @BeanProperty var state: AnyRef) extends ManagedClientConnection { + def getId(): String = ??? + def bind(socket: Socket) = ??? + def getSocket() = ??? 
+ override def isSecure(): Boolean = route.isSecure + + override def getSSLSession(): javax.net.ssl.SSLSession = null + + override def open(route: HttpRoute, context: HttpContext, params: HttpParams) { + close() + this.route = route + } + + override def tunnelTarget(secure: Boolean, params: HttpParams) { + throw new IOException("tunnelTarget() not supported") + } + + override def tunnelProxy(next: HttpHost, secure: Boolean, params: HttpParams) { + throw new IOException("tunnelProxy() not supported") + } + + override def layerProtocol(context: HttpContext, params: HttpParams) { + throw new IOException("layerProtocol() not supported") + } + + override def markReusable() { + reusable = true + } + + override def unmarkReusable() { + reusable = false + } + + override def isMarkedReusable(): Boolean = reusable + + override def setIdleDuration(duration: Long, unit: TimeUnit) { + } + + override def isResponseAvailable(timeout: Int): Boolean = response != null + + override def sendRequestHeader(request: HttpRequest) { + val host = route.getTargetHost + val uri = new URI(host.getSchemeName + "://" + host.getHostName + (if ((host.getPort == -1)) "" else (":" + host.getPort)) + request.getRequestLine.getUri) + this.request = new HTTPRequest(uri.toURL(), HTTPMethod.valueOf(request.getRequestLine.getMethod), FetchOptions.Builder.disallowTruncate()) + for (h <- request.getAllHeaders) { + this.request.addHeader(new HTTPHeader(h.getName, h.getValue)) + } + } + + override def sendRequestEntity(request: HttpEntityEnclosingRequest) { + val baos = new ByteArrayOutputStream() + if (request.getEntity != null) { + request.getEntity.writeTo(baos) + } + this.request.setPayload(baos.toByteArray()) + } + + override def receiveResponseHeader(): HttpResponse = { + if (this.response == null) { + flush() + } + val response = new BasicHttpResponse(new ProtocolVersion("HTTP", 1, 1), this.response.getResponseCode, null) + for (h <- this.response.getHeaders) { + response.addHeader(h.getName, 
h.getValue) + } + response + } + + override def receiveResponseEntity(response2: HttpResponse) { + //TODO review response2 and this.response + if (this.response == null) { + throw new IOException("receiveResponseEntity() called on closed connection") + } + val bae = new ByteArrayEntity(this.response.getContent) + bae.setContentType(response2.getFirstHeader("Content-Type")) + response2.setEntity(bae) + this.response = null + } + + override def flush() { + if (request != null) { + try { + response = urlFS.fetch(request) + request = null + } catch { + case ex: IOException => { + ex.printStackTrace() + throw ex + } + } + } else { + response = null + } + } + + override def close() { + request = null + response = null + closed = true + } + + override def isOpen(): Boolean = request != null || response != null + + override def isStale(): Boolean = !isOpen && !closed + + override def setSocketTimeout(timeout: Int) { + } + + override def getSocketTimeout(): Int = -1 + + override def shutdown() { + close() + } + + override def getMetrics(): HttpConnectionMetrics = null + + override def getLocalAddress(): InetAddress = null + + override def getLocalPort(): Int = 0 + + override def getRemoteAddress(): InetAddress = null + + override def getRemotePort(): Int = { + val host = route.getTargetHost + connManager.getSchemeRegistry.getScheme(host).resolvePort(host.getPort) + } + + override def releaseConnection() { + connManager.releaseConnection(this, java.lang.Long.MAX_VALUE, TimeUnit.MILLISECONDS) + } + + override def abortConnection() { + unmarkReusable() + shutdown() + } + + private var connManager: ClientConnectionManager = cm + + private var reusable: Boolean = _ + + private var request: HTTPRequest = _ + + private var response: HTTPResponse = _ + + private var closed: Boolean = true +} diff --git a/src/main/scala/com/gravity/goose/network/gae/GAEConnectionManager.scala b/src/main/scala/com/gravity/goose/network/gae/GAEConnectionManager.scala new file mode 100644 index 
000000000..5e8f8819a --- /dev/null +++ b/src/main/scala/com/gravity/goose/network/gae/GAEConnectionManager.scala @@ -0,0 +1,65 @@ +package com.gravity.goose.network.gae + +import org.apache.http.conn.ClientConnectionManager +import org.apache.http.conn.ClientConnectionRequest +import org.apache.http.conn.ManagedClientConnection +import org.apache.http.conn.routing.HttpRoute +import org.apache.http.conn.scheme.Scheme +import org.apache.http.conn.scheme.SchemeRegistry +import org.apache.http.conn.scheme.SchemeSocketFactory +import org.apache.http.params.HttpParams +import java.net.InetSocketAddress +import java.net.Socket +import java.util.concurrent.TimeUnit +//import scala.reflect.BeanProperty +import scala.beans.BeanProperty +import com.gravity.goose.network.GAEClientConnection +import org.apache.http.conn.HttpClientConnectionManager + +class GAEConnectionManager extends ClientConnectionManager { + @BeanProperty + var schemeRegistry: SchemeRegistry = new SchemeRegistry() + + val no_socket_factory = new SchemeSocketFactory() { + + override def isSecure(sock: Socket): Boolean = false + + override def createSocket(params: HttpParams): Socket = null + + override def connectSocket(sock: Socket, remoteAddress: InetSocketAddress, localAddress: InetSocketAddress, params: HttpParams): Socket = { + null + } + } + + schemeRegistry.register(new Scheme("http", 80, no_socket_factory)) + + schemeRegistry.register(new Scheme("https", 443, no_socket_factory)) + + override def requestConnection(route: HttpRoute, state: AnyRef): ClientConnectionRequest = { + new ClientConnectionRequest() { + + def abortRequest() { + } + + def getConnection(timeout: Long, tunit: TimeUnit): ManagedClientConnection = { + GAEConnectionManager.this.getConnection(route, state) + } + } + } + + override def releaseConnection(conn: ManagedClientConnection, validDuration: Long, timeUnit: TimeUnit) { + } + + override def closeIdleConnections(idletime: Long, tunit: TimeUnit) { + } + + override def 
closeExpiredConnections() { + } + + override def shutdown() { + } + + private def getConnection(route: HttpRoute, state: AnyRef): ManagedClientConnection = { + new GAEClientConnection(this, route, state) + } +} diff --git a/src/main/scala/com/gravity/goose/opengraph/OpenGraphData.scala b/src/main/scala/com/gravity/goose/opengraph/OpenGraphData.scala new file mode 100644 index 000000000..f7e0aec17 --- /dev/null +++ b/src/main/scala/com/gravity/goose/opengraph/OpenGraphData.scala @@ -0,0 +1,35 @@ +/** +Copyright [2014] Robby Pond + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ +package com.gravity.goose.opengraph; + +import scala.collection.mutable.Set +import com.github.nscala_time.time.Imports._ + +case class OpenGraphData() { + var title: String = "" + var siteName: String = "" + var url: String = "" + var description: String = "" + var image: String = "" + var ogType: String = "" + var locale: String = "" + var author: String = "" + var publisher: String = "" + var publishedTime : DateTime = null + var modifiedTime : DateTime = null + var tags : Set[String] = Set() + var section : String = "" +} diff --git a/src/main/scala/com/gravity/goose/outputformatters/OutputFormatter.scala b/src/main/scala/com/gravity/goose/outputformatters/OutputFormatter.scala index a86a2252b..e887b3683 100644 --- a/src/main/scala/com/gravity/goose/outputformatters/OutputFormatter.scala +++ b/src/main/scala/com/gravity/goose/outputformatters/OutputFormatter.scala @@ -24,6 +24,7 @@ import org.jsoup.select.Elements import com.gravity.goose.text.StopWords import scala.collection.JavaConversions._ import org.slf4j.Logger +import com.gravity.goose.Language._ /** * Created by Jim Plush @@ -49,11 +50,13 @@ trait OutputFormatter { * @param topNode the top most node to format * @return the prepared Element */ - @Deprecated def getFormattedElement(topNode: Element): Element = { +// @Deprecated def getFormattedElement(topNode: Element, language: Language): Element = { + @Deprecated def getFormattedElement(topNode: Element, lang: String): Element = { removeNodesWithNegativeScores(topNode) convertLinksToText(topNode) replaceTagsWithText(topNode) - removeParagraphsWithFewWords(topNode) +// removeParagraphsWithFewWords(topNode, language) + removeParagraphsWithFewWords(topNode, lang) topNode } @@ -62,12 +65,17 @@ trait OutputFormatter { * @param topNode the top most node to format * @return a formatted string with all HTML removed */ - def getFormattedText(topNode: Element): String = { - removeNodesWithNegativeScores(topNode) - convertLinksToText(topNode) - 
replaceTagsWithText(topNode) - removeParagraphsWithFewWords(topNode) - convertToText(topNode) +// def getFormattedText(topNode: Element, language: Language): String = { + def getFormattedText(topNode: Element, lang: String): String = { + //with clonning replacing text cannot happen since nodes don't have a parent and this trigger exceptions in jsoup: convertLinksToText/item.replaceWith(tn) + //var node = topNode.clone + val node = topNode + removeNodesWithNegativeScores(node) + convertLinksToText(node) + replaceTagsWithText(node) +// removeParagraphsWithFewWords(node, language) + removeParagraphsWithFewWords(node, lang) + convertToText(node) } /** @@ -80,12 +88,39 @@ trait OutputFormatter { case null => "" case node => { (node.children().map((e: Element) => { - StringEscapeUtils.unescapeHtml(e.text).trim + var text = StringEscapeUtils.unescapeHtml(e.text).trim + text })).toList.mkString("\n\n") } } + /** + * Scape the node content and return the html + * @param topNode the top most node to format + * @return a formatted string with all HTML + */ + //def cleanupHtml(topNode: Element, language: Language): String = { + def cleanupHtml(topNode: Element, language: String): String = { + //with clonning replacing text cannot happen since nodes don't have a parent and this trigge + //val node = topNode.clone + val node = topNode + removeParagraphsWithFewWords(node, language) + convertToHtml(node) + } + + private def convertToHtml(topNode: Element): String = topNode match { + case null => "" + case node => { + StringEscapeUtils.unescapeHtml(node.html).trim + + (node.children().map((e: Element) => { + // FIXTHIS - Use some jsoup class to do this + "

" + StringEscapeUtils.unescapeHtml(e.html).trim + "

" + })).mkString + } + } + /** * cleans up and converts any nodes that should be considered text into text */ @@ -173,18 +208,21 @@ trait OutputFormatter { /** * remove paragraphs that have less than x number of words, would indicate that it's some sort of link */ - private def removeParagraphsWithFewWords(topNode: Element) { +// private def removeParagraphsWithFewWords(topNode: Element, language: Language) { + private def removeParagraphsWithFewWords(topNode: Element, lang: String) { if (topNode != null) { if (logger.isDebugEnabled) { logger.debug("removeParagraphsWithFewWords starting...") } - val allNodes = topNode.getAllElements + val paragraphs = topNode.getElementsByTag("p") - for (el <- allNodes) { + for (el <- paragraphs) { try { - val stopWords = StopWords.getStopWordCount(el.text) - if (stopWords.getStopWordCount < 3 && el.getElementsByTag("object").size == 0 && el.getElementsByTag("embed").size == 0) { +// val stopWords = StopWords.getStopWordCount(el.text, language) +// if (el.text.size < 8 && stopWords.getStopWordCount < 3 && el.getElementsByTag("object").size == 0 && el.getElementsByTag("embed").size == 0) { + val stopWords = StopWords.getStopWordCount(el.text, lang) + if (el.text.size < 8 && stopWords.getStopWordCount < 3 && el.getElementsByTag("object").size == 0 && el.getElementsByTag("embed").size == 0) { logger.debug("removeParagraphsWithFewWords - swcnt: %d removing text: %s".format(stopWords.getStopWordCount, el.text())) el.remove() } @@ -208,4 +246,4 @@ trait OutputFormatter { } } } -} \ No newline at end of file +} diff --git a/src/main/scala/com/gravity/goose/text/StopWords.scala b/src/main/scala/com/gravity/goose/text/StopWords.scala index f1b7ea94d..3d9886bbe 100644 --- a/src/main/scala/com/gravity/goose/text/StopWords.scala +++ b/src/main/scala/com/gravity/goose/text/StopWords.scala @@ -24,39 +24,108 @@ package com.gravity.goose.text * Date: 8/16/11 */ -import java.util._ import com.gravity.goose.utils.FileHelper +import 
com.gravity.goose.Language._ +import com.chenlb.mmseg4j.ComplexSeg +import com.chenlb.mmseg4j.Dictionary +import com.chenlb.mmseg4j.MMSeg +import com.chenlb.mmseg4j.Seg +import com.chenlb.mmseg4j.Word +import java.io.StringReader +import scala.collection.JavaConversions._ +import java.util.HashMap +import scala.collection.Set +import java.util.Map +import com.gravity.goose.Language object StopWords { // the confusing pattern below is basically just match any non-word character excluding white-space. private val PUNCTUATION: StringReplacement = StringReplacement.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]", string.empty) - val STOP_WORDS = FileHelper.loadResourceFile("stopwords-en.txt", StopWords.getClass).split(sys.props("line.separator")).toSet +//raisercostin: use the other method of memoising the languages on first access + // TODO: there must a better way to do this. See + // http://www.uofr.net/~greg/java/get-resource-listing.html? +// val LANGUAGES: Set[String] = Set("ar", "da", "de", "en", "es", "fi", "fr", +// "hu", "id", "it", "ko", "nb", "nl", "no", +// "pl", "pt", "ru", "sv", "zh") +// +// val stopWordsMap: Map[String, Set[String]] = +// (LANGUAGES.view map {lang => +// lang -> +// FileHelper.loadResourceFile("stopwords-" + lang + ".txt", +// StopWords.getClass).split(sys.props("line.separator")).toSet +// }).toMap.withDefaultValue(Set()) + //val STOP_WORDS = FileHelper.loadResourceFile("stopwords-en.txt", StopWords.getClass).split(sys.props("line.separator")).toSet + private var stopWordsMap: Map[String, Set[String]] = new HashMap[String, Set[String]]() def removePunctuation(str: String): String = { PUNCTUATION.replaceAll(str) } + + def getStopWords(language: Language): Set[String] = getStopWords(language.toString) - def getStopWordCount(content: String): WordStats = { + def getStopWords(lname: String): Set[String] = { + + var stopWords = stopWordsMap.get(lname) + if (stopWords == null) { + var stopWordsFile = "stopwords-%s.txt" format 
lname + stopWords = FileHelper.loadResourceFile(stopWordsFile, StopWords.getClass).split(sys.props("line.separator")).toSet + stopWords = stopWords.map(s=>s.trim) + stopWordsMap.put(lname, stopWords) + } + stopWords + } + def getCandidateWords(strippedInput: String, language: String): Array[String] = getCandidateWords(strippedInput, + Language(language)) + + def getCandidateWords(strippedInput: String, language: Language): Array[String] = { + language match { + case English => string.SPACE_SPLITTER.split(strippedInput) + case Chinese => tokenize(strippedInput).toArray + case _ => string.SPACE_SPLITTER.split(strippedInput) + } + } + + def getStopWordCount(content: String, lang: String = "en"): WordStats = { +// def getStopWordCount(content: String, language: Language): WordStats = { if (string.isNullOrEmpty(content)) return WordStats.EMPTY val ws: WordStats = new WordStats val strippedInput: String = removePunctuation(content) - val candidateWords: Array[String] = string.SPACE_SPLITTER.split(strippedInput) - - val overlappingStopWords: List[String] = new ArrayList[String] + //val candidateWords = getCandidateWords(strippedInput, language) + val candidateWords = getCandidateWords(strippedInput, lang) + + var overlappingStopWords: List[String] = List[String]() +// val stopWords = getStopWords(language) + val stopWords = getStopWords(lang) +if (stopWords.size > 0) { + //scala-ify? 
overlappingStopWords = candidateWords.filter(w=>stopWords.contains(w.toLowerCase)).map(w=>w.toLowerCase) candidateWords.foreach(w => { - if (STOP_WORDS.contains(w.toLowerCase)) overlappingStopWords.add(w.toLowerCase) + if (stopWords.contains(w.toLowerCase)) { + overlappingStopWords = w.toLowerCase :: overlappingStopWords + } }) +} ws.setWordCount(candidateWords.length) ws.setStopWordCount(overlappingStopWords.size) ws.setStopWords(overlappingStopWords) ws } - - -} + + def tokenize(line: String): List[String] = { + + var seg = new ComplexSeg(Dictionary.getInstance()); + var mmSeg = new MMSeg(new StringReader(line), seg); + var tokens = List[String](); + var word = mmSeg.next() + while (word != null) { + tokens = word.getString() :: tokens ; + word = mmSeg.next(); + } + return tokens; + } +} \ No newline at end of file diff --git a/src/main/scala/com/gravity/goose/text/WordStats.scala b/src/main/scala/com/gravity/goose/text/WordStats.scala index d6b6006c4..d6caad4d6 100644 --- a/src/main/scala/com/gravity/goose/text/WordStats.scala +++ b/src/main/scala/com/gravity/goose/text/WordStats.scala @@ -21,6 +21,8 @@ package com.gravity.goose.text import java.util.ArrayList import java.util.List +import scala.collection.JavaConversions._ + /** * User: Jim Plush * Date: Oct 29, 2010 @@ -30,23 +32,23 @@ object WordStats { var EMPTY: WordStats = new WordStats } -class WordStats { - - +class WordStats(_stopWords:List[String], _wordCount:Int) { import WordStats._ - + def this() = this(new ArrayList(), 0) /** * total number of stopwords or good words that we can calculate */ - var stopWordCount: Int = 0 + var stopWordCount : Int = _stopWords.size() + /** * total number of words on a node */ - var wordCount: Int = 0 + var wordCount: Int = _wordCount + /** * holds an actual list of the stop words we found */ - var stopWords: List[String] = new ArrayList[String] + var stopWords: List[String] = _stopWords def getStopWords: List[String] = { stopWords @@ -72,6 +74,8 @@ class WordStats { 
wordCount = cnt } - + override def toString: String = + "Word statistics: words = " + wordCount + ", stop words = " + + stopWordCount + " (" + stopWords.mkString(", ") + ")" } diff --git a/src/main/scala/com/gravity/goose/utils/FileHelper.scala b/src/main/scala/com/gravity/goose/utils/FileHelper.scala index 2fb434832..c6adc2cd7 100644 --- a/src/main/scala/com/gravity/goose/utils/FileHelper.scala +++ b/src/main/scala/com/gravity/goose/utils/FileHelper.scala @@ -37,7 +37,8 @@ object FileHelper extends Logging { filedata = IOUtils.toString(is, "UTF-8") } catch { - case e: IOException => warn(e, e.toString) + case e: IOException => warn(s"Error while reading $filename: "+e, e.toString) + case e: NullPointerException => warn(s"Error while reading $filename: "+e, e.toString) } filedata } diff --git a/src/main/scala/com/gravity/goose/utils/JsonUtil.scala b/src/main/scala/com/gravity/goose/utils/JsonUtil.scala new file mode 100644 index 000000000..4ce77bd7e --- /dev/null +++ b/src/main/scala/com/gravity/goose/utils/JsonUtil.scala @@ -0,0 +1,59 @@ +package com.gravity.goose.util + +import java.io._ +import com.fasterxml.jackson.core._ +import com.fasterxml.jackson.databind._ +import com.fasterxml.jackson.databind.SerializationFeature +import com.fasterxml.jackson.module.scala.DefaultScalaModule + +object JsonUtil { + + private val LOG = org.slf4j.LoggerFactory.getLogger("JsonUtil") + + private val mapper = create() + + private def create(): ObjectMapper = { + val mapper = new ObjectMapper() + mapper.configure(SerializationFeature.INDENT_OUTPUT, true) + mapper.registerModule(DefaultScalaModule) + mapper + } + + def toJson[T](data: T): String = { + try { + mapper.writeValueAsString(data) + } catch { + case e: IOException => { + LOG.warn("can't format a json object from [" + data + "]", e) + null + } + } + } + + def toJsonNode[T](data: T): JsonNode = mapper.valueToTree(data) + + def fromJson[T](description: String, theClass: Class[T]): T = { + 
//PlayUtils.fixClassloader(theClass) + val parse = mapper.readValue(description, classOf[JsonNode]) + val fromJson = mapper.treeToValue(parse, theClass) + fromJson + } + + private def shorter(description: String): String = { + val maxSize = 1000 + if (description == null || description.length < maxSize) { + return description + } + description.substring(0, maxSize - 3) + "..." + } + + def copy[T](data: T): T = { + fromJson(toJson(data), data.getClass.asInstanceOf[Class[T]]) + } + + def clone[T](`object`: T, excludeFields: String): T = { + val exported = JsonUtil.toJson(`object`) + val obj = JsonUtil.fromJson(exported, `object`.getClass) + obj.asInstanceOf[T] + } +} diff --git a/src/main/scala/com/gravity/goose/utils/URLHelper.scala b/src/main/scala/com/gravity/goose/utils/URLHelper.scala index e6966648f..cd8881970 100644 --- a/src/main/scala/com/gravity/goose/utils/URLHelper.scala +++ b/src/main/scala/com/gravity/goose/utils/URLHelper.scala @@ -21,6 +21,7 @@ package com.gravity.goose.utils import com.gravity.goose.text.{StringReplacement, HashUtils} import java.net.{URI, MalformedURLException, URL} import org.apache.http.client.methods.HttpGet +import java.io.{StringWriter,PrintWriter} /** * Created by Jim Plush @@ -57,13 +58,21 @@ object URLHelper extends Logging { urlToCrawl, unknown.getClass.getCanonicalName, unknown.getMessage, - unknown.getStackTraceString) + getStackTraceString(unknown)) None } } } + def getStackTraceString(e: Exception) = { + val stringWriter = new StringWriter + val printWriter = new PrintWriter(stringWriter) + e.printStackTrace(printWriter) + stringWriter.toString + } + + def tryToURL(url: String): Option[URL] = { val finalUrl = if (url.contains("#!")) { ESCAPED_FRAGMENT_REPLACEMENT.replaceAll(url) @@ -98,4 +107,4 @@ object URLHelper extends Logging { case None => None } } -} \ No newline at end of file +} diff --git a/src/main/resources/com/gravity/goose/statichtml/aol1.txt b/src/test/resources/com/gravity/goose/statichtml/aol1.txt 
similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/aol1.txt rename to src/test/resources/com/gravity/goose/statichtml/aol1.txt diff --git a/src/test/resources/com/gravity/goose/statichtml/bug1.html b/src/test/resources/com/gravity/goose/statichtml/bug1.html new file mode 100644 index 000000000..09d0d1c2d --- /dev/null +++ b/src/test/resources/com/gravity/goose/statichtml/bug1.html @@ -0,0 +1,1490 @@ + + + + Producatori - SC DAROMFARM SRL + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ +
+ +
+

Categorii

+
+ + +
+
+ + +
+

+ Producători +

+
+ +
+
+ + + + +
+

Newsletter

+
+
+

+ +

+

+ + +

+
+
+
+ + +
+

Caută

+ +
+ + + + + + +
+ +
+ + + +

Producători

+

+ Sunt 48 producători. +

+ + + + +
+ +
+ + +
+

+ Coş + +   +

+
+ + + +
+

Nici un produs

+

+ Livrare + 0,00 lei +
+ Total + 0,00 lei +

+

+ Plăteşte +

+
+
+
+ + +
+ + comenzi telefinice + +
+ + +
+ + Urmariti-ne pe facebook + +
+ + + + + +
+

+ Reduceri +

+
+ +

+ Toate reducerile +

+
+
+ + +
+

Caută

+ +
+ + + +
+
+ + +
+ + + + diff --git a/src/main/resources/com/gravity/goose/statichtml/businessinsider1.txt b/src/test/resources/com/gravity/goose/statichtml/businessinsider1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/businessinsider1.txt rename to src/test/resources/com/gravity/goose/statichtml/businessinsider1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/businessinsider2.txt b/src/test/resources/com/gravity/goose/statichtml/businessinsider2.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/businessinsider2.txt rename to src/test/resources/com/gravity/goose/statichtml/businessinsider2.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/businessweek1.txt b/src/test/resources/com/gravity/goose/statichtml/businessweek1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/businessweek1.txt rename to src/test/resources/com/gravity/goose/statichtml/businessweek1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/businessweek2.txt b/src/test/resources/com/gravity/goose/statichtml/businessweek2.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/businessweek2.txt rename to src/test/resources/com/gravity/goose/statichtml/businessweek2.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/businessweek3.txt b/src/test/resources/com/gravity/goose/statichtml/businessweek3.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/businessweek3.txt rename to src/test/resources/com/gravity/goose/statichtml/businessweek3.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/cnbc1.txt b/src/test/resources/com/gravity/goose/statichtml/cnbc1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/cnbc1.txt rename to src/test/resources/com/gravity/goose/statichtml/cnbc1.txt diff --git 
a/src/main/resources/com/gravity/goose/statichtml/cnet1.txt b/src/test/resources/com/gravity/goose/statichtml/cnet1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/cnet1.txt rename to src/test/resources/com/gravity/goose/statichtml/cnet1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/cnn1.txt b/src/test/resources/com/gravity/goose/statichtml/cnn1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/cnn1.txt rename to src/test/resources/com/gravity/goose/statichtml/cnn1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/engadget1.txt b/src/test/resources/com/gravity/goose/statichtml/engadget1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/engadget1.txt rename to src/test/resources/com/gravity/goose/statichtml/engadget1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/espn1.txt b/src/test/resources/com/gravity/goose/statichtml/espn1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/espn1.txt rename to src/test/resources/com/gravity/goose/statichtml/espn1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/foxnews1.txt b/src/test/resources/com/gravity/goose/statichtml/foxnews1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/foxnews1.txt rename to src/test/resources/com/gravity/goose/statichtml/foxnews1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/guardian1.txt b/src/test/resources/com/gravity/goose/statichtml/guardian1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/guardian1.txt rename to src/test/resources/com/gravity/goose/statichtml/guardian1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/guardian1_result.txt b/src/test/resources/com/gravity/goose/statichtml/guardian1_result.txt similarity index 100% rename from 
src/main/resources/com/gravity/goose/statichtml/guardian1_result.txt rename to src/test/resources/com/gravity/goose/statichtml/guardian1_result.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/huffpo1.txt b/src/test/resources/com/gravity/goose/statichtml/huffpo1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/huffpo1.txt rename to src/test/resources/com/gravity/goose/statichtml/huffpo1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/huffpo2.txt b/src/test/resources/com/gravity/goose/statichtml/huffpo2.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/huffpo2.txt rename to src/test/resources/com/gravity/goose/statichtml/huffpo2.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/issue_24.txt b/src/test/resources/com/gravity/goose/statichtml/issue_24.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/issue_24.txt rename to src/test/resources/com/gravity/goose/statichtml/issue_24.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/issue_24_result.txt b/src/test/resources/com/gravity/goose/statichtml/issue_24_result.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/issue_24_result.txt rename to src/test/resources/com/gravity/goose/statichtml/issue_24_result.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/issue_25.txt b/src/test/resources/com/gravity/goose/statichtml/issue_25.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/issue_25.txt rename to src/test/resources/com/gravity/goose/statichtml/issue_25.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/issue_28.txt b/src/test/resources/com/gravity/goose/statichtml/issue_28.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/issue_28.txt rename to src/test/resources/com/gravity/goose/statichtml/issue_28.txt diff --git 
a/src/main/resources/com/gravity/goose/statichtml/issue_32.txt b/src/test/resources/com/gravity/goose/statichtml/issue_32.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/issue_32.txt rename to src/test/resources/com/gravity/goose/statichtml/issue_32.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/msn1.txt b/src/test/resources/com/gravity/goose/statichtml/msn1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/msn1.txt rename to src/test/resources/com/gravity/goose/statichtml/msn1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/msn1_result.txt b/src/test/resources/com/gravity/goose/statichtml/msn1_result.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/msn1_result.txt rename to src/test/resources/com/gravity/goose/statichtml/msn1_result.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/politico1.txt b/src/test/resources/com/gravity/goose/statichtml/politico1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/politico1.txt rename to src/test/resources/com/gravity/goose/statichtml/politico1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/scribd1.txt b/src/test/resources/com/gravity/goose/statichtml/scribd1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/scribd1.txt rename to src/test/resources/com/gravity/goose/statichtml/scribd1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/techcrunch1.txt b/src/test/resources/com/gravity/goose/statichtml/techcrunch1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/techcrunch1.txt rename to src/test/resources/com/gravity/goose/statichtml/techcrunch1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/time1.txt b/src/test/resources/com/gravity/goose/statichtml/time1.txt similarity index 100% rename from 
src/main/resources/com/gravity/goose/statichtml/time1.txt rename to src/test/resources/com/gravity/goose/statichtml/time1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/time2.txt b/src/test/resources/com/gravity/goose/statichtml/time2.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/time2.txt rename to src/test/resources/com/gravity/goose/statichtml/time2.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/usatoday1.txt b/src/test/resources/com/gravity/goose/statichtml/usatoday1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/usatoday1.txt rename to src/test/resources/com/gravity/goose/statichtml/usatoday1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/wired1.txt b/src/test/resources/com/gravity/goose/statichtml/wired1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/wired1.txt rename to src/test/resources/com/gravity/goose/statichtml/wired1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/wsj1.txt b/src/test/resources/com/gravity/goose/statichtml/wsj1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/wsj1.txt rename to src/test/resources/com/gravity/goose/statichtml/wsj1.txt diff --git a/src/main/resources/com/gravity/goose/statichtml/yahoo1.txt b/src/test/resources/com/gravity/goose/statichtml/yahoo1.txt similarity index 100% rename from src/main/resources/com/gravity/goose/statichtml/yahoo1.txt rename to src/test/resources/com/gravity/goose/statichtml/yahoo1.txt diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index 38435e33f..3c922bb5e 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -1,5 +1,5 @@ # ***** Set root logger level to WARN and its two appenders to stdout and R. 
-log4j.rootLogger=trace, stdout +log4j.rootLogger=TRACE, stdout # ***** stdout is set to be a ConsoleAppender. log4j.appender.stdout=org.apache.log4j.ConsoleAppender @@ -19,10 +19,10 @@ log4j.appender.stdout.layout.ConversionPattern=%p %c - %m%n #set httpclient debug levels log4j.logger.org.apache.component=ERROR,stdout -log4j.logger.org.apache.http.wire=ERROR,stdout log4j.logger.org.apache.commons.httpclient=ERROR,stdout -log4j.logger.org.apache.http.client.protocol=ERROR,stdout -log4j.logger.org.apache.http=ERROR,stdout - +#log4j.logger.org.apache.http=ERROR,stdout +#log4j.logger.org.apache.http.wire=ERROR,stdout +#log4j.logger.org.apache.http.client.protocol=ERROR,stdout +#log4j.logger.org.apache.http.impl.conn=ERROR,stdout log4j.logger.net.sf.jmimemagic=WARN diff --git a/src/test/scala/com/gravity/goose/AicaiTest.scala b/src/test/scala/com/gravity/goose/AicaiTest.scala new file mode 100644 index 000000000..0532c5c7a --- /dev/null +++ b/src/test/scala/com/gravity/goose/AicaiTest.scala @@ -0,0 +1,54 @@ +package com.gravity.goose + +import org.junit.Test +import org.junit.Assert._ +import scala.io.Source +import java.io.IOException; +import java.io.StringReader; +import java.util.logging.LogManager; + +import com.chenlb.mmseg4j.ComplexSeg; +import com.chenlb.mmseg4j.Dictionary; +import com.chenlb.mmseg4j.MMSeg; +import com.chenlb.mmseg4j.Seg; +import com.chenlb.mmseg4j.Word; + +import com.gravity.goose.extractors.VoicesContentExtractor +import com.gravity.goose.text.StopWords + + +/** + * Created by Jim Plush + * User: jim + * Date: 8/16/11 + * This class hits live websites and is only run manually, not part of the tests lifecycle + */ +class AicaiTest { + + @Test + def testArticleElementedArticle() { // to verify issue #56 is resolved + var config = TestUtils.NO_IMAGE_CONFIG + config.language = Language.Chinese + //val url = "http://www.csdn.net/article/2014-04-14/2819287-what-supercell-did-next" + //val url = 
"http://sports.sina.com.cn/g/laliga/2014-04-14/11447118806.shtml" + val url = "http://luoxiaowei.baijia.baidu.com/article/11833" + + val html = Source.fromURL(url) + val goose = new Goose(config) + val article = goose.extractContent(url) + + println(article.cleanedArticleText) + } + + def tokenize(line: String): List[String] = { + var seg = new ComplexSeg(Dictionary.getInstance()); + var mmSeg = new MMSeg(new StringReader(line), seg); + var tokens = List[String](); + var word = mmSeg.next() + while (word != null) { + tokens = word.getString() :: tokens ; + word = mmSeg.next(); + } + return tokens; + } +} \ No newline at end of file diff --git a/src/test/scala/com/gravity/goose/AllImagesTest.scala b/src/test/scala/com/gravity/goose/AllImagesTest.scala new file mode 100644 index 000000000..71e1a7628 --- /dev/null +++ b/src/test/scala/com/gravity/goose/AllImagesTest.scala @@ -0,0 +1,39 @@ +package com.gravity.goose + +import extractors.PublishDateExtractor +import org.junit.Test +import org.junit.Assert._ +import utils.FileHelper +import java.text.SimpleDateFormat +import org.jsoup.select.Selector +import org.jsoup.nodes.Element +import java.util.Date + +/** + * Created by Francisco Vieira + * User: fvieira + * Date: 27/10/14 + */ + +class AllImagesTest { + + def getHtml(filename: String): String = { + FileHelper.loadResourceFile(TestUtils.staticHtmlDir + filename, Goose.getClass) + } + + @Test + def allImages() { + implicit val config = TestUtils.DEFAULT_CONFIG + val url = "http://blog.pkhamre.com/2012/07/24/understanding-statsd-and-graphite/" + val html = getHtml("allImages.txt") + val article = TestUtils.getArticle(url, html) + val images = "http://blog.pkhamre.com/images/irssi-conversation.png" :: + "http://blog.pkhamre.com/images/graphite-render.png" :: + "http://blog.pkhamre.com/images/graphite-registrations.png" :: + "http://blog.pkhamre.com/images/graphite-registrations-derivative.png" :: + Nil + TestUtils.runArticleAssertions(article = article, expectedImages 
= images) + TestUtils.printReport() + } + +} diff --git a/src/test/scala/com/gravity/goose/GoldSitesTestIT.scala b/src/test/scala/com/gravity/goose/GoldSitesTestIT.scala index 054746807..8dc80849d 100644 --- a/src/test/scala/com/gravity/goose/GoldSitesTestIT.scala +++ b/src/test/scala/com/gravity/goose/GoldSitesTestIT.scala @@ -2,7 +2,8 @@ package com.gravity.goose import org.junit.Test import org.junit.Assert._ -import com.gravity.goose.extractors.VoicesContentExtractor +import org.jsoup.nodes.Element +import com.gravity.goose.extractors.{VoicesContentExtractor, AdditionalDataExtractor} /** * Created by Jim Plush @@ -10,582 +11,73 @@ import com.gravity.goose.extractors.VoicesContentExtractor * Date: 8/16/11 * This class hits live websites and is only run manually, not part of the tests lifecycle */ -class GoldSitesTestIT { - - @Test - def testArticleElementedArticle() { // to verify issue #56 is resolved - implicit val config = TestUtils.NO_IMAGE_CONFIG - val url = "http://www.repubblica.it/economia/2012/05/12/news/giovani_anziani_asili_nido_e_soldi_per_il_sud_ecco_il_progetto_del_governo_per_l_equit-34962952/" - val content = "UN PIANO per l'equità e la crescita destinato in primo luogo al Sud. L'ha varato ieri il Consiglio dei ministri." 
- val title = "Giovani, anziani, asili nido e soldi per il Sud ecco il progetto del governo per l'equità " - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, expectedTitle = title, expectedStart = content) - TestUtils.printReport() - } - - @Test - def techCrunch() { - implicit val config = TestUtils.DEFAULT_CONFIG - // implicit val config = TestUtils.NO_IMAGE_CONFIG - val url = "http://techcrunch.com/2011/08/13/2005-zuckerberg-didnt-want-to-take-over-the-world/" - val content = "The Huffington Post has come across this fascinating five-minute interview" - val image = "http://tctechcrunch2011.files.wordpress.com/2011/08/screen-shot-2011-08-13-at-6-43-20-pm1.png?w=640" - val title = "2005 Zuckerberg Didn’t Want To Take Over The World" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, expectedTitle = title, expectedImage = image, expectedStart = content) - TestUtils.printReport() - } - - - @Test - def cnn() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url = "http://www.cnn.com/2010/POLITICS/08/13/democrats.social.security/index.html" - val article = TestUtils.getArticle(url) - val title = "Democrats to use Social Security against GOP this fall" - val content = "Washington (CNN) -- Democrats pledged " - val image = "http://i.cdn.turner.com/cnn/2010/POLITICS/08/13/democrats.social.security/story.kaine.gi.jpg" - TestUtils.runArticleAssertions(article = article, expectedTitle = title, expectedStart = content, expectedImage = image) - TestUtils.printReport() - } - - @Test - def cnn2() { - val url = "http://www.cnn.com/2011/POLITICS/10/06/tea.party.left/index.html?hpt=hp_t1" - implicit val config = TestUtils.DEFAULT_CONFIG - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Washington (CNN) -- Wall Street should have seen it coming. 
After all, market forces were at work.", - expectedImage = "http://i.cdn.turner.com/cnn/2011/POLITICS/10/06/tea.party.left/t1larg.occupydc2.jpg") - TestUtils.printReport() - } - - @Test - def businessWeek() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.businessweek.com/magazine/content/10_34/b4192066630779.htm" - val article: Article = TestUtils.getArticle(url) - val title = "Olivia Munn: Queen of the Uncool" - val content = "Six years ago, Olivia Munn arrived in Hollywood with fading ambitions of making it as a sports reporter and set about deploying" - val image = "http://images.businessweek.com/mz/10/34/370/1034_mz_66popmunnessa.jpg" - TestUtils.runArticleAssertions(article = article, expectedTitle = title, expectedStart = content, expectedImage = image) - } - - - @Test - def businessWeek2() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.businessweek.com/magazine/content/10_34/b4192048613870.htm" - val article: Article = TestUtils.getArticle(url) - val content = "There's discord on Wall Street: Strategists at major American investment banks see a" - val image = "http://images.businessweek.com/mz/covers/current_120x160.jpg" - TestUtils.runArticleAssertions(article = article, expectedStart = content, expectedImage = image) - - } - - @Test - def businessWeek3() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.businessinsider.com/ben-and-jerrys-schweddy-balls-one-million-moms-american-family-association-boycott-2011-9" - val article: Article = TestUtils.getArticle(url) - // if (article == null) println("NULL ARTICLE!") else println("TEXT: \n" + article.cleanedArticleText) - val content = "Not everyone's a fan of Ben & Jerry's new \"Schweddy Balls\" -- the Saturday Night Live-inspired flavor it rolled out a few weeks ago" - val image = 
"http://static7.businessinsider.com/image/4e68c8c36bb3f7d80a000016/conservative-moms-are-now-calling-for-a-boycott-of-ben-and-jerrys-schweddy-balls-flavor.jpg" - TestUtils.runArticleAssertions(article = article, expectedStart = content, expectedImage = image) - - } - - @Test - def desertNews() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url = "http://www.deseretnews.com/article/705388385/High-school-basketball-Top-Utah-prospects-representing-well.html" - val article = TestUtils.getArticle(url) - val content = "Utah isn't known nationally for producing top basketball talent" - TestUtils.runArticleAssertions(article, expectedStart = content) - - } - - @Test - def foxNews() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.foxnews.com/politics/2010/08/14/russias-nuclear-help-iran-stirs-questions-improved-relations/" - val article = TestUtils.getArticle(url) - val content = "Russia's announcement that it will help Iran get nuclear fuel is raising questions" - val image = "http://a57.foxnews.com/static/managed/img/Politics/396/223/startsign.jpg" - TestUtils.runArticleAssertions(article = article, expectedStart = content, expectedImage = image) - - } - - @Test - def foxNews2() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.foxnews.com/politics/2011/10/06/obama-defends-528-million-federal-loan-to-bankrupt-solyndra/" - val article = TestUtils.getArticle(url) - val content = "The director of the controversial loan program that cleared the way for a $535" - val image = "http://a57.foxnews.com/static/managed/img/Politics/396/223/silver_jonathan.jpg" - TestUtils.runArticleAssertions(article = article, expectedStart = content, expectedImage = image) - } - - @Test - def msnbc() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://bottomline.msnbc.msn.com/_news/2011/10/06/8190264-even-without-jobs-apple-still-shines-analysts-say" - val article = TestUtils.getArticle(url) - 
val content = "The death of technology titan Steve Jobs, co-founder and former CEO of Apple" - val image = "http://msnbcmedia.msn.com/j/MSNBC/Components/Photo/_new/tz-biz-11106-applefuture-108p.nv_auth_landscape.jpg" - TestUtils.runArticleAssertions(article = article, expectedStart = content, expectedImage = image) - } - - - @Test - def laTimes() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.latimes.com/business/la-fi-jobs-legacy-hiltzik-20111006,0,5186643.column" - val article = TestUtils.getArticle(url) - val content = "Everyone knows Steve Jobs pulled off one of the outstanding corporate turnarounds in U.S. history" - val image = "http://www.latimes.com/media/photo/2011-10/65235661.jpg" - TestUtils.runArticleAssertions(article = article, expectedStart = content, expectedImage = image) - TestUtils.printReport() - - } - - @Test - def aolNews() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.aolnews.com/nation/article/the-few-the-proud-the-marines-getting-a-makeover/19592478" - val article = TestUtils.getArticle(url) - val content = "WASHINGTON (Aug. 
13) -- Declaring \"the maritime soul of the Marine Corps\" is" - val image = "http://o.aolcdn.com/photo-hub/news_gallery/6/8/680919/1281734929876.JPEG" - TestUtils.runArticleAssertions(article = article, expectedStart = content, expectedImage = image) - } - - @Test - def wallStreetJournal() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://online.wsj.com/article/SB10001424052748704532204575397061414483040.html" - val article = TestUtils.getArticle(url) - val content = "The Obama administration has paid out less than a third of the nearly $230 billion" - val image = "http://s.wsj.net/public/resources/images/OB-JO759_0814st_A_20100814143158.jpg" - TestUtils.runArticleAssertions(article = article, expectedStart = content, expectedImage = image) - } - - @Test - def usaToday() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://content.usatoday.com/communities/thehuddle/post/2010/08/brett-favre-practices-set-to-speak-about-return-to-minnesota-vikings/1" - val article = TestUtils.getArticle(url) - val content = "Brett Favre couldn't get away from the" - val image = "http://i.usatoday.net/communitymanager/_photos/the-huddle/2010/08/18/favrespeaksx-inset-community.jpg" - TestUtils.runArticleAssertions(article = article, expectedStart = content, expectedImage = image) - } - - @Test - def usaToday2() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://content.usatoday.com/communities/driveon/post/2010/08/gm-finally-files-for-ipo/1" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "General Motors just filed with the Securities and Exchange ", - expectedImage = "http://i.usatoday.net/communitymanager/_photos/drive-on/2010/08/18/cruzex-wide-community.jpg") - } - - @Test - def usaToday3() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = 
"http://www.usatoday.com/money/perfi/funds/story/2011-10-05/3q-mutual-fund-report/50674776/1" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Timothy McIntosh, a Tampa financial planner, has always been able to soothe his customers after a rough patch in the stock market. Until now.", - expectedImage = "http://i.usatoday.net/money/_photos/2011/10/05/many-quit-stocks-is-it-time-to-buy-blen936-x.jpg") - TestUtils.printReport() - } - - - @Test - def espn() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://sports.espn.go.com/espn/commentary/news/story?id=5461430" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "If you believe what college football coaches have said about sports", - expectedImage = "http://a.espncdn.com/photo/2010/0813/pg2_g_bush3x_300.jpg") - } - - @Test - def washingtonpost() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.washingtonpost.com/wp-dyn/content/article/2010/12/08/AR2010120803185.html" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "The Supreme Court sounded ", - expectedImage = "http://media3.washingtonpost.com/wp-dyn/content/photo/2010/10/09/PH2010100904575.jpg") - } - - @Test - def gizmodo() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://gizmodo.com/5833746/what-if-the-earthquake-had-hit-manhattan" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Today's 5.9 magnitude earthquake was felt throughout the Mid-Atlantic", - expectedImage = "http://cache.gizmodo.com/assets/images/4/2011/08/fb_aftershock-earthquake-in-new-york-original.jpg") - } - - @Test - def engadget() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = 
"http://www.engadget.com/2010/08/18/verizon-fios-set-top-boxes-getting-a-new-hd-guide-external-stor/" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Streaming and downloading TV content to mobiles is nice", - expectedImage = "http://www.blogcdn.com/www.engadget.com/media/2010/08/44ni600.jpg") - } - - @Test - def time() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.time.com/time/health/article/0,8599,2011497,00.html" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "This month, the federal government released", - expectedImage = "http://img.timeinc.net/time/daily/2010/1008/bp_oil_spill_0817.jpg") - } - - @Test - def time2() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://newsfeed.time.com/2011/08/24/washington-monument-closes-to-repair-earthquake-induced-crack/" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Despite what the jeers of jaded Californians might suggest", - expectedImage = "http://timenewsfeed.files.wordpress.com/2011/08/newsfeed_0824.jpg?w=150") - } - - @Test - def time404() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://newsfeed.time.com/2011/08/24/washington-monument-closes-to-repair-earthquake-induced-FOO-BAR/" - val article = TestUtils.getArticle(url) - - assertNull("Article title should be null for a 404 url!", article.title) - } - - @Test - def tulsaWorld() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.tulsaworld.com/site/articlepath.aspx?articleid=20111118_61_A16_Opposi344152&rss_lnk=7" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Opposition to a proposal to remove certain personal data") - - } - - - @Test - def cnet() { - implicit val config = 
TestUtils.DEFAULT_CONFIG - val url: String = "http://news.cnet.com/8301-30686_3-20014053-266.html?tag=topStories1" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "The phone company is adding bells and whistles to", - expectedImage = "http://i.i.com.com/cnwk.1d/i/tim//2010/08/18/Verizon_iPad_and_live_TV_610x458.JPG") - } - - @Test - def wired() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.wired.com/epicenter/2011/10/steve-jobs-disability/" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "When I heard that Steve Jobs had passed away, I was boarding a train from New York to Philadelphia to visit my son.", - expectedImage = "http://www.wired.com/images_blogs/business/2011/10/Apple-Siri-Blind-660x375.jpg") - } - - @Test - def msn() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://lifestyle.msn.com/your-life/your-money-today/article.aspx?cp-documentid=31244150" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "\"Head to the supermarket an hour before closing time. 
Some stores mark down ", - expectedImage = "http://blu.stb.s-msn.com/i/6D/1235D306AF18A532BCDC8EB1CC42.jpg") - TestUtils.printReport() - } - - @Test - def ap() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://hosted2.ap.org/APDEFAULT/bbd825583c8542898e6fa7d440b9febc/Article_2011-10-06-Kids-Concussions/id-6cb44517aaec4303936fa07d5490dce6" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "(AP) — The number of athletic children going to hospitals with concussions is up 60 percent in the past decade", - expectedImage = null) - TestUtils.printReport() - } - - - @Test - def yahoo() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://news.yahoo.com/apple-says-steve-jobs-resigning-ceo-224628633.html" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "SAN FRANCISCO (AP) — Steve Jobs, the mind behind the iPhone", - expectedImage = "http://l1.yimg.com/bt/api/res/1.2/rQjGYdY_uYh6LpCnzkGFvQ--/YXBwaWQ9eW5ld3M7Zmk9ZmlsbDtoPTc1O3E9ODU7dz0xMDA-/http://media.zenfs.com/en_us/News/ap_webfeeds/89854c5c8090bd15df0e6a706700dfbc.jpg") - } - - @Test - def abcnews() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://abcnews.go.com/Technology/steve-jobs-fire-company/story?id=14683754" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Steve Jobs was just 30 years old, wildly successful, fabulously wealthy and a global celebrity. 
And then it all came crashing down.", - expectedImage = "http://a.abcnews.com/images/Technology/gty_steve_jobs_port_4_dm_111006_wg.jpg") - } - - @Test - def businessInsider() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.businessinsider.com/closing-bell-september-20-2011-9" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "And now we're looking at two down days in a", - expectedImage = "http://static7.businessinsider.com/image/4df5d311ccd1d5591f190000/major-rally-collapses-ahead-of-huge-day-heres-what-you-need-to-know.jpg") - TestUtils.printReport() - } - - @Test - def financialTimes() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.ft.com/intl/cms/s/2/4e268022-e472-11e0-92a3-00144feabdc0.htm" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Hewlett-Packard shares jumped nearly 7 per", - expectedImage = null) - TestUtils.printReport() - } - - @Test - def huffpoBusiness() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.huffingtonpost.com/david-macaray/labor-union-membership_b_973038.html" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "For men and women who plan on entering the job", - expectedImage = null) - TestUtils.printReport() - } - - @Test - def huffpo() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.huffingtonpost.com/2011/10/06/alabama-workers-immigration-law_n_997793.html" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "MONTGOMERY, Ala. 
-- Alabama's strict new immigration law may be backfiring.", - expectedImage = "http://i.huffpost.com/gen/369284/thumbs/s-ALABAMA-WORKERS-IMMIGRATION-LAW-large.jpg") - TestUtils.printReport() - } - - - @Test - def huffpoBusiness2() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.huffingtonpost.com/2011/09/21/us-sees-challenges-in-s_n_974724.html" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "WASHINGTON (Reuters) - The government is continuing an aggressive drive to hold accountable", - expectedImage = null) - TestUtils.printReport() - } - - @Test - def nyTimes1() { - implicit val config = TestUtils.DEFAULT_CONFIG - config.setBrowserUserAgent("grvGoose") - val url: String = "http://www.nytimes.com/2011/09/20/arts/design/preserving-the-american-folk-art-museums-place-in-new-york.html?_r=1&ref=arts" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Please. Someone, everyone, do something to save the American Folk Art Museum from dissolution and dispersa", - expectedImage = "http://graphics8.nytimes.com/images/2011/09/20/arts/20folkart-web/20folkart-web-articleLarge.jpg") - TestUtils.printReport() - } - - @Test - def nyTimes2() { - implicit val config = TestUtils.DEFAULT_CONFIG - config.setBrowserUserAgent("grvGoose") - val url: String = "http://www.nytimes.com/2011/10/07/health/07prostate.html?_r=1&hp" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "Healthy men should no longer receive a P.S.A. 
blood test to screen for prostate cancer because the test does not save lives", - expectedImage = null) - TestUtils.printReport() - } - - - @Test - def gooseRequestParameters() { - implicit val config = TestUtils.DEFAULT_CONFIG - config.setBrowserUserAgent("grvGoose") - val url: String = "http://jimplush.com/public/uploads/goosetest.php" - val article = TestUtils.getArticle(url) - println(article.rawHtml) - - } +/** + * Updated by Marco Singer + * User: marcosinger + * Date: 4/3/12 + * This list now just soccer websites + */ +class GoldSitesTestIT { @Test - def cnbc() { - implicit val config = TestUtils.NO_IMAGE_CONFIG - - val url: String = "http://www.cnbc.com/id/44613978" - val article = TestUtils.getArticle(url) - - TestUtils.runArticleAssertions(article = article, - expectedStart = "Some traders found Wednesday's Fed statement to be a bit gloomier than expected.") - TestUtils.printReport() - } - - @Test - def cnbc3() { + def folha() { implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.cnbc.com//id/44608735" - val article = TestUtils.getArticle(url) - - TestUtils.runArticleAssertions(article = article, - expectedStart = "Existing home sales rose more than expected in August to the fastest annual", - expectedImage = "http://media.cnbc.com/i/CNBC/Sections/News_And_Analysis/__Story_Inserts/graphics/__REAL_ESTATE/home_sales13.jpg") - TestUtils.printReport() - } - - - // @Test - // def cnbc2() { - // // commented out while this issue is resolve: https://github.com/jhy/jsoup/issues/130 - // implicit val config = TestUtils.DEFAULT_CONFIG - // val url: String = "http://www.cnbc.com/id/44614459" - // val article = TestUtils.getArticle(url) - // println(article.cleanedArticleText) - // TestUtils.runArticleAssertions(article = article, - // expectedStart = "Some traders found Wednesday's Fed statement to be a bit gloomier than expected.", - // expectedImage = 
"http://media.cnbc.com/i/CNBC/Sections/News_And_Analysis/__Story_Inserts/graphics/__FEDERAL_RESERVE/FED_RESERVE3.jpg") - // TestUtils.printReport() - // } - - @Test - def yahooFinance() { - val url = "http://finance.yahoo.com/news/Mulling-Meg-Whitman-HP-apf-4116866737.html?x=0" - implicit val config = TestUtils.DEFAULT_CONFIG - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "SAN FRANCISCO (AP) -- As trial balloons go", - expectedImage = "http://chart.finance.yahoo.com/instrument/1.0/HPQ/chart;range=1d/image;size=239x110?lang=en-US®ion=US") - TestUtils.printReport() - } + val url = "http://www1.folha.uol.com.br/esporte/1070420-leao-critica-regulamento-do-paulista-e-poe-culpa-na-tv.shtml" + val content = "Após retomar a liderança do Campeonato Paulista, com a vitória do São Paulo de virada por 4 a 2 sobre o Ituano" + val image = "http://f.i.uol.com.br/folha/esporte/images/12084302.jpeg" + val title = "Leão critica regulamento do Paulista e põe culpa na TV" + val htmlContent = "

Após retomar a liderança do Campeonato Paulista, com a vitória do São Paulo de virada por 4 a 2 sobre o Ituano, o técnico Emerson Leão voltou a criticar a fórmula de disputa da competição e a FPF (Federação Paulista de Futebol), apontado a culpa para a emissora de televisão dona dos direitos de transmissão.

" + val article = TestUtils.getArticle(url) + println(article.htmlArticle) - @Test - def time3() { - val url = "http://www.time.com/time/magazine/article/0,9171,804054,00.html" - implicit val config = TestUtils.DEFAULT_CONFIG - val article = TestUtils.getArticle(url) TestUtils.runArticleAssertions(article = article, - expectedStart = "The hemline could no longer be held. With wartime controls on", - expectedImage = null) - TestUtils.printReport() + expectedStart = content, + expectedHtmlStart = htmlContent, + expectedTitle = title, + expectedImage = image) } @Test - def yahooFinance2() { - val url = "http://finance.yahoo.com/news/Stocks-plunge-after-Fed-apf-3386772167.html?x=0" + def lancenet() { implicit val config = TestUtils.DEFAULT_CONFIG - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "NEW YORK (AP) -- The Federal Reserve did what investors", - expectedImage = "http://l.yimg.com/a/p/fi/41/20/44.jpg") - TestUtils.printReport() - } - @Test - def businessinsider() { - val url = "http://www.businessinsider.com/meanwhile-developments-in-greece-2011-9" - implicit val config = TestUtils.DEFAULT_CONFIG - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "As everyone in the world was transfixed on the Fed", - expectedImage = "http://static5.businessinsider.com/image/4e77323e69beddba4c00001c/meanwhile-developments-in-greece.jpg") - TestUtils.printReport() - } - - @Test - def businessinsider2() { - val url = "http://www.businessinsider.com/goldman-on-the-fed-announcement-2011-9" - implicit val config = TestUtils.DEFAULT_CONFIG - val article = TestUtils.getArticle(url) + val url = "http://www.lancenet.com.br/sao-paulo/Leao-Arena-Barueri-casa-Tricolor_0_675532605.html" + val content = "No próximo sábado, o São Paulo jogará, como mandante, na Arena Barueri diante do Mogi Mirim" + val image = 
"http://www.lancenet.com.br/futebol-general/Paulo-Catanduvense-Campeonato-Paulista-Fernandinho_LANIMA20120329_0148_25.jpg" + val title = "Para Leão, Arena Barueri não é casa do Tricolor" + val htmlContent = "

No próximo sábado, o São Paulo jogará, como mandante, na Arena Barueri diante do Mogi Mirim. Isso porque no estádio do Morumbi haverá, nesta terça-feira à noite, mais um show do ex-baixista do Pink Floyd, Roger Waters. Show que prejudicará o gramado, tornando-o quase que impraticável até o fim de semana.

" + val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "From Goldman on the FOMC operation twist announcement", - expectedImage = "http://static8.businessinsider.com/image/4e7a0dd26bb3f7da4800003d/goldman-4-key-points-on-the-fomc-announcement.jpg") - TestUtils.printReport() - } + println(article.htmlArticle) - @Test - def politico() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://www.politico.com/news/stories/1010/43352.html" - val article = TestUtils.getArticle(url) TestUtils.runArticleAssertions(article = article, - expectedStart = "If the newest Census Bureau estimates stay close to form", - expectedImage = "http://images.politico.com/global/news/100927_obama22_ap_328.jpg") + expectedStart = content, + expectedHtmlStart = htmlContent, + expectedTitle = title, + expectedImage = image) } @Test - def buzznetImages() { + def globoesporte() { implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://newageamazon.buzznet.com/user/journal/17025056/doubt-gives-hope-new-album/" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "We've had so many false hopes with the new No Doubt CD.", - expectedImage = "http://img.buzznet.com/assets/imgx/2/0/8/2/2/2/1/3/orig-20822213.jpg") - } - @Test - def timeImages() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://swampland.time.com/2012/01/09/hecklers-and-hostile-crowds-stymie-santorum-in-new-hampshire/" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, - expectedStart = "It was a scene fit for a front-runner: an overflow crowd spilling out the doors of a Rockwellian", - expectedImage = "http://timeswampland.files.wordpress.com/2012/01/sl_santprotest_0109_blog.jpg?w=600&h=400&crop=1") - } + val url = 
"http://globoesporte.globo.com/futebol/times/sao-paulo/noticia/2012/04/filho-do-gramado-leao-administra-o-sao-paulo-na-base-da-conversa.html" + val content = "Emerson Leão não foi ao campo na manhã desta terça-feira no centro de treinamento do São Paulo" + val image = "http://s2.glbimg.com/DKjyTG2ZACkmIUmt2NcSIuR8k48J3DLeS8Txhob9fJz2lXAYXrrJq_ZurQ44i4Jn/s.glbimg.com/es/ge/f/original/2012/03/25/leao_ae_marioangelo.jpg" + val title = "'Filho do gramado', Leão administra o São Paulo na base da conversa" + val htmlContent = "

Emerson Leão não foi ao campo na manhã desta terça-feira no centro de treinamento do São Paulo. Bem humorado e com roupa casual, preferiu acompanhar de longe o trabalho físico que seus comandados fizeram na academia e no gramado. Sem a urgência de fazer qualquer ajuste, o comandante optou por trabalhar nos bastidores.

" + val article = TestUtils.getArticle(url) - @Test - def cnnMoneyImages() { - implicit val config = TestUtils.DEFAULT_CONFIG - val url: String = "http://money.cnn.com/2012/01/09/pf/suze_orman_prepaid_card/index.htm?iid=HP_LN" - val article = TestUtils.getArticle(url) TestUtils.runArticleAssertions(article = article, - expectedStart = "NEW YORK (CNNMoney) -- CNBC's outspoken financial adviser, Suze", - expectedImage = "http://i2.cdn.turner.com/money/2012/01/09/pf/suze_orman_prepaid_card/suze-orman.top.jpg") - } - - @Test - def yahooVoices() { - implicit val config = { - val myConfig = new Configuration - myConfig.enableImageFetching = false - myConfig.setContentExtractor(new VoicesContentExtractor) - myConfig - } - val url: String = "http://voices.yahoo.com/article/9330101/lovess-demise-10882501.html" - val article = TestUtils.getArticle(url) - TestUtils.runArticleAssertions(article = article, expectedTitle = "Love's Demise", - expectedStart = "Do we not love like lovers in demise? We both know our love has faded away;") + expectedStart = content, + expectedHtmlStart = htmlContent, + expectedTitle = title, + expectedImage = image) } } diff --git a/src/test/scala/com/gravity/goose/GooseTest.scala b/src/test/scala/com/gravity/goose/GooseTest.scala index 7235a34ef..83358a2db 100644 --- a/src/test/scala/com/gravity/goose/GooseTest.scala +++ b/src/test/scala/com/gravity/goose/GooseTest.scala @@ -2,9 +2,7 @@ package com.gravity.goose import org.junit.Test import org.junit.Assert._ -import scala.actors.Future -import scala.actors.Futures._ - +import java.net.UnknownHostException /** * Created by Jim Plush * User: jim @@ -16,7 +14,6 @@ class GooseTest { @Test def gooseTest() { - // implicit val config = new Configuration // val url = "http://techcrunch.com/2011/08/13/2005-zuckerberg-didnt-want-to-take-over-the-world/" // val goose = new Goose(config) @@ -39,10 +36,14 @@ class GooseTest { def badlink() { implicit val config = new Configuration val url = 
"http://nolove888.com/2011/08/13/LINKNOTEXISTS" + //val url = "https://developer.apple.com/library/ios/documentation/NetworkingInternet/Conceptual/RemoteNotificationsPG/Chapters/ApplePushService.html" val goose = new Goose(config) + try { val article = goose.extractContent(url) - assertNull(article.topNode) + fail() + } catch { + case e: UnknownHostException => // Expected + } } - -} \ No newline at end of file +} diff --git a/src/test/scala/com/gravity/goose/OpenGraphTest.scala b/src/test/scala/com/gravity/goose/OpenGraphTest.scala new file mode 100644 index 000000000..9fe20250e --- /dev/null +++ b/src/test/scala/com/gravity/goose/OpenGraphTest.scala @@ -0,0 +1,32 @@ +package com.gravity.goose + +import org.junit.Test +import org.junit.Assert._ + +class OpenGraphTest { + + @Test + def openGraph() { + implicit val config = TestUtils.NO_IMAGE_CONFIG + // og tags for http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html + /* + + + + + + */ + val url: String = "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html" + val article = TestUtils.getArticle(url) + assertEquals("og:description was not as expected!", article.openGraphData.description, + "A 'world's hottest chilli' competition at a curry restaurant left two people in hospital.") + assertEquals("og:title was not as expected!", article.openGraphData.title, + "World's hottest chilli contest leaves two in hospital - Telegraph") + assertEquals("og:url was not as expected!", article.openGraphData.url, + "http://www.telegraph.co.uk/foodanddrink/foodanddrinknews/8808120/Worlds-hottest-chilli-contest-leaves-two-in-hospital.html") + assertEquals("og:image was not as expected!", article.openGraphData.image, + "http://i.telegraph.co.uk/multimedia/archive/02018/Kismot-Killer_2018476a.jpg") + assertEquals("og:type was not as expected!", article.openGraphData.ogType, + "article") + } +} \ 
No newline at end of file diff --git a/src/test/scala/com/gravity/goose/TestUtils.scala b/src/test/scala/com/gravity/goose/TestUtils.scala index 58fc1cb87..6059e59cf 100644 --- a/src/test/scala/com/gravity/goose/TestUtils.scala +++ b/src/test/scala/com/gravity/goose/TestUtils.scala @@ -4,6 +4,7 @@ import images.Image import junit.framework.Assert._ import com.gravity.goose.extractors.AdditionalDataExtractor import org.jsoup.nodes.Element +import scala.util.Try /** * Created by Jim Plush @@ -18,7 +19,10 @@ object TestUtils { private val TAB = "\t\t"; val articleReport = new StringBuilder("=======================::. ARTICLE REPORT .::======================\n"); - val DEFAULT_CONFIG: Configuration = new Configuration + val DEFAULT_CONFIG: Configuration = new Configuration( + localStoragePath=Try{java.io.File.createTempFile("temp", null).getParentFile().getAbsolutePath()}.getOrElse(null) + ) + //DEFAULT_CONFIG. val NO_IMAGE_CONFIG: Configuration = new Configuration NO_IMAGE_CONFIG.enableImageFetching = false @@ -44,18 +48,21 @@ object TestUtils { article } - def runArticleAssertions(article: Article, expectedTitle: String = null, expectedStart: String = null, expectedImage: String = null, expectedDescription: String = null, expectedKeywords: String = null): Unit = { - articleReport.append("URL: ").append(TAB).append(article.finalUrl).append(NL) - articleReport.append("TITLE: ").append(TAB).append(article.title).append(NL) - articleReport.append("IMAGE: ").append(TAB).append(article.topImage.getImageSrc).append(NL) - articleReport.append("IMGKIND: ").append(TAB).append(article.topImage.imageExtractionType).append(NL) - articleReport.append("CONTENT: ").append(TAB).append(article.cleanedArticleText.replace("\n", " ")).append(NL) - articleReport.append("METAKW: ").append(TAB).append(article.metaKeywords).append(NL) - articleReport.append("METADESC: ").append(TAB).append(article.metaDescription).append(NL) - articleReport.append("DOMAIN: 
").append(TAB).append(article.domain).append(NL) - articleReport.append("LINKHASH: ").append(TAB).append(article.linkhash).append(NL) - articleReport.append("MOVIES: ").append(TAB).append(article.movies).append(NL) - articleReport.append("TAGS: ").append(TAB).append(article.tags).append(NL) + def runArticleAssertions(article: Article, expectedTitle: String = null, expectedStart: String = null, expectedHtmlStart: String = null, expectedImage: String = null, expectedImages: List[String] = null, expectedDescription: String = null, expectedKeywords: String = null): Unit = { + articleReport.append("URL: ").append(TAB).append(article.finalUrl).append(NL) + articleReport.append("TITLE: ").append(TAB).append(article.title).append(NL) + articleReport.append("IMAGE: ").append(TAB).append(article.topImage.getImageSrc).append(NL) + articleReport.append("All_IMGS: ").append(TAB).append(article.allImages).append(NL) + articleReport.append("IMGKIND: ").append(TAB).append(article.topImage.imageExtractionType).append(NL) + articleReport.append("ALL_IMAGES: ").append(TAB).append(article.allImages.map((i: Image) => i.getImageSrc)).append(NL) + articleReport.append("CONTENT: ").append(TAB).append(article.cleanedArticleText.replace("\n", " ")).append(NL) + articleReport.append("HTML CONTENT: ").append(TAB).append(article.htmlArticle).append(NL) + articleReport.append("METAKW: ").append(TAB).append(article.metaKeywords).append(NL) + articleReport.append("METADESC: ").append(TAB).append(article.metaDescription).append(NL) + articleReport.append("DOMAIN: ").append(TAB).append(article.domain).append(NL) + articleReport.append("LINKHASH: ").append(TAB).append(article.linkhash).append(NL) + articleReport.append("MOVIES: ").append(TAB).append(article.movies).append(NL) + articleReport.append("TAGS: ").append(TAB).append(article.tags).append(NL) assertNotNull("Resulting article was NULL!", article) @@ -71,6 +78,13 @@ object TestUtils { val actual: String = articleText.substring(0, 
expectedStart.length) assertEquals("The beginning of the article text was not as expected!", expectedStart, actual) } + if (expectedHtmlStart != null) { + val articleHtml: String = article.htmlArticle + assertNotNull("Resulting article html was NULL!", articleHtml) + assertTrue("Article html was not as long as expected beginning!", expectedHtmlStart.length <= articleHtml.length) + val actual: String = articleHtml.substring(0, expectedHtmlStart.length) + assertEquals("The beginning of the article html was not as expected!", expectedHtmlStart, actual) + } if (expectedImage != null) { val image: Image = article.topImage assertNotNull("Top image was NULL!", image) @@ -78,6 +92,16 @@ object TestUtils { assertNotNull("Image src was NULL!", src) assertEquals("Image src was not as expected!", expectedImage, src) } + if (expectedImages != null) { + val images: List[Image] = article.allImages + assertNotNull("Images was NULL!", images) + assertEquals("Different number of images then expected!", expectedImages.size, images.size) + images.zip(expectedImages).foreach{ case (i: Image, ei: String) => + val src: String = i.getImageSrc + assertNotNull("Image src was NULL!", src) + assertEquals("Image src was not as expected!", ei, src) + } + } if (expectedDescription != null) { val description: String = article.metaDescription assertNotNull("Meta Description was NULL!", description) @@ -93,4 +117,4 @@ object TestUtils { def printReport() { println(articleReport) } -} \ No newline at end of file +} diff --git a/src/test/scala/com/gravity/goose/TextExtractions.scala b/src/test/scala/com/gravity/goose/TextExtractionsTest.scala similarity index 94% rename from src/test/scala/com/gravity/goose/TextExtractions.scala rename to src/test/scala/com/gravity/goose/TextExtractionsTest.scala index 4840dfd99..931e06974 100644 --- a/src/test/scala/com/gravity/goose/TextExtractions.scala +++ b/src/test/scala/com/gravity/goose/TextExtractionsTest.scala @@ -15,7 +15,7 @@ import java.util.Date * Date: 
8/19/11 */ -class TextExtractions { +class TextExtractionsTest { def getHtml(filename: String): String = { FileHelper.loadResourceFile(TestUtils.staticHtmlDir + filename, Goose.getClass) @@ -41,7 +41,6 @@ class TextExtractions { TestUtils.runArticleAssertions(article = article, expectedStart = "At Home Depot, we first realized we needed to have a real conversation with", expectedImage = null) - TestUtils.printReport() } @Test @@ -53,7 +52,6 @@ class TextExtractions { TestUtils.runArticleAssertions(article = article, expectedStart = "Get ready, America, because by Christmas 2012 you will have an Apple TV in your living room", expectedImage = null) - TestUtils.printReport() } @Test @@ -174,22 +172,22 @@ class TextExtractions { def wiredPubDate() { val url = "http://www.wired.com/playbook/2010/08/stress-hormones-boxing/"; val html = getHtml("wired1.txt") - val fmt = new SimpleDateFormat("yyyy-MM-dd") - + //val fmt = new SimpleDateFormat("yyyy-MM-dd") + import com.github.nscala_time.time.Imports._ + val dateParser = DateTimeFormat.forPattern("yyyy-MM-dd") // example of a custom PublishDateExtractor implicit val config = new Configuration(); config.enableImageFetching = false config.setPublishDateExtractor(new PublishDateExtractor() { @Override - def extract(rootElement: Element): Date = { + def extract(rootElement: Element): DateTime = { // look for this guy: val elements = Selector.select("meta[name=DisplayDate]", rootElement); if (elements.size() == 0) return null; val metaDisplayDate = elements.get(0); if (metaDisplayDate.hasAttr("content")) { val dateStr = metaDisplayDate.attr("content"); - - return fmt.parse(dateStr); + return dateParser.parseDateTime(dateStr); } null; } @@ -204,9 +202,7 @@ class TextExtractions { val expectedDateString = "2010-08-18"; assertNotNull("publishDate should not be null!", article.publishDate); - assertEquals("Publish date should equal: \"2010-08-18\"", expectedDateString, fmt.format(article.publishDate)); - System.out.println("Publish 
Date Extracted: " + fmt.format(article.publishDate)); - + assertEquals("Publish date should equal: \"2010-08-18\"", expectedDateString, dateParser.print(new DateTime(article.publishDate))); } @Test @@ -313,8 +309,6 @@ class TextExtractions { val article = TestUtils.getArticle(url, html) TestUtils.runArticleAssertions(article = article, expectedStart = "As everyone in the world was transfixed on the Fed") - - println(article.cleanedArticleText) } @Test @@ -388,5 +382,15 @@ class TextExtractions { expectedImage = null) } - -} \ No newline at end of file + @Test + def bug1() { + // html is not parsed properly + implicit val config = TestUtils.NO_IMAGE_CONFIG + val html = getHtml("bug1.html") + val url: String = "http://www.tulsaworld.com/site/articlepath.aspx?articleid=20111118_61_A16_Opposi344152&rss_lnk=7" + val article = TestUtils.getArticle(url, html) + TestUtils.runArticleAssertions(article = article, + expectedStart = "            Produsele naturale şi ecologice au devenit u", + expectedImage = null) + } +} diff --git a/src/test/scala/com/gravity/goose/images/ImageUtilsIT.scala b/src/test/scala/com/gravity/goose/images/ImageUtilsIT.scala index 4d5c203b1..f668636b2 100644 --- a/src/test/scala/com/gravity/goose/images/ImageUtilsIT.scala +++ b/src/test/scala/com/gravity/goose/images/ImageUtilsIT.scala @@ -18,7 +18,6 @@ class ImageUtilsIT { def storeImageLocally() { val httpClient: HttpClient = HtmlFetcher.getHttpClient val imgSrc = "http://tctechcrunch2011.files.wordpress.com/2011/09/aaaaa.png?w=288m" - println(ImageUtils.storeImageToLocalFile(httpClient, "abc", imgSrc, new Configuration)) } @@ -27,4 +26,4 @@ class ImageUtilsIT { val tmpFile = "/tmp/goose/abc_5dd5d54ec1e9742a09cbe9fdf7c8a4ef" // println(ImageUtils.getFileExtensionName(tmpFile, new Configuration)) } -} \ No newline at end of file +} diff --git a/src/test/scala/com/gravity/goose/text/StopWordsTest.scala b/src/test/scala/com/gravity/goose/text/StopWordsTest.scala new file mode 100644 index 
000000000..4919ce8d9 --- /dev/null +++ b/src/test/scala/com/gravity/goose/text/StopWordsTest.scala @@ -0,0 +1,26 @@ +package com.gravity.goose.text + +import org.junit.Test +import org.junit.Assert._ +import scala.collection.JavaConversions._ + +/** + * Created by Jim Plush + * User: jim + * Date: 8/14/11 + */ + +class StopWordsTest { + + @Test + def findsHowManyStopWordsWeHave() { + assertEquals(0, StopWords.getStopWordCount("blah blah blah").getStopWordCount) + assertEquals(1, StopWords.getStopWordCount("although blah de blah").getStopWordCount) + } + + @Test + def determinesWhichWordsAreStopWords() { + assertEquals(seqAsJavaList(List("although")), StopWords.getStopWordCount("although blah de blah").getStopWords) + assertEquals(seqAsJavaList(List()), StopWords.getStopWordCount("blah de blah").getStopWords) + } +} \ No newline at end of file diff --git a/src/test/scala/com/gravity/goose/utils/FileHelperTest.scala b/src/test/scala/com/gravity/goose/utils/FileHelperTest.scala index 7cf13d88b..47d8b7cd2 100644 --- a/src/test/scala/com/gravity/goose/utils/FileHelperTest.scala +++ b/src/test/scala/com/gravity/goose/utils/FileHelperTest.scala @@ -14,9 +14,8 @@ class FileHelperTest { @Test def loadFileContents() { - println("loading test") val txt = FileHelper.loadResourceFile("stopwords-en.txt", StopWords.getClass) assertTrue(txt.startsWith("a's")) } -} \ No newline at end of file +} diff --git a/test-2014-10-29 16_20_15-Scala - Scala IDE.png b/test-2014-10-29 16_20_15-Scala - Scala IDE.png new file mode 100644 index 000000000..96f083c3e Binary files /dev/null and b/test-2014-10-29 16_20_15-Scala - Scala IDE.png differ