Skip to content

Commit

Permalink
Merge pull request #23 from brianmadden/upgrade_packages
Browse files Browse the repository at this point in the history
Upgrade packages
  • Loading branch information
brianmadden authored Jan 29, 2020
2 parents 53a14af + f3a7165 commit f6edb2d
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 45 deletions.
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ repositories {
}
dependencies {
compile 'com.github.brianmadden:krawler:0.4.3'
compile 'com.github.brianmadden:krawler:0.4.4'
}
```
#### Using Maven
Expand All @@ -54,7 +54,7 @@ dependencies {
<dependency>
<groupId>com.github.brianmadden</groupId>
<artifactId>krawler</artifactId>
<version>0.4.3</version>
<version>0.4.4</version>
</dependency>
```

Expand Down Expand Up @@ -113,6 +113,11 @@ Roadmap

Release Notes
=============
**0.4.4 (2020-01-29)**
- Upgrade Kotlin to 1.3.61
- Upgrade `kotlinx.coroutines`. This required an update to some of the places where coroutine builders were called internally.
- Upgrade Gradle wrapper

**0.4.3 (2017-11-20)**
- Added ability to clear crawl queues by RequestId and Age, see `Krawler#removeUrlsByRootPage`
and `Krawler#removeUrlsByAge`
Expand Down
12 changes: 7 additions & 5 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@ group 'io.thelandscape'
version '0.4.3'

buildscript {
ext.kotlin_version = '1.1.60'
ext.kotlin_version = '1.3.61'

repositories {
maven {
url "https://plugins.gradle.org/m2/"
}
mavenCentral()
jcenter()
}
dependencies {
classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version"
classpath "com.github.jengelman.gradle.plugins:shadow:1.2.4"
classpath "com.github.jengelman.gradle.plugins:shadow:5.2.0"
}
}

Expand All @@ -20,8 +23,6 @@ allprojects {
apply plugin: "java"
apply plugin: "maven"

kotlin { experimental { coroutines 'enable' } }

repositories {
mavenCentral()
jcenter()
Expand All @@ -35,7 +36,7 @@ project(":") {
dependencies {
compile "org.jetbrains.kotlin:kotlin-stdlib:$kotlin_version"
compile "org.jetbrains.kotlin:kotlin-reflect:$kotlin_version"
compile 'org.jetbrains.kotlinx:kotlinx-coroutines-core:0.19.3'
compile 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.3.3'

compile "org.apache.httpcomponents:httpclient:4.5.2"
compile group: 'org.hsqldb', name: 'hsqldb', version: '2.3.4'
Expand Down Expand Up @@ -90,5 +91,6 @@ project(":example") {

dependencies {
compile project(":")

}
}
Binary file modified gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
6 changes: 3 additions & 3 deletions gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#Sun May 07 21:17:57 PDT 2017
#Tue Jan 28 21:52:57 PST 2020
distributionUrl=https\://services.gradle.org/distributions/gradle-6.1.1-all.zip
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-2.13-all.zip
zipStoreBase=GRADLE_USER_HOME
26 changes: 17 additions & 9 deletions gradlew
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/usr/bin/env sh

##############################################################################
##
Expand Down Expand Up @@ -33,11 +33,11 @@ DEFAULT_JVM_OPTS=""
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"

warn ( ) {
warn () {
echo "$*"
}

die ( ) {
die () {
echo
echo "$*"
echo
Expand Down Expand Up @@ -154,11 +154,19 @@ if $cygwin ; then
esac
fi

# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
function splitJvmOpts() {
JVM_OPTS=("$@")
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
APP_ARGS=$(save "$@")

exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"

# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
cd "$(dirname "$0")"
fi

exec "$JAVACMD" "$@"
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ import com.google.common.cache.CacheBuilder
import com.google.common.cache.CacheLoader
import com.google.common.cache.LoadingCache
import io.thelandscape.krawler.crawler.KrawlConfig
import kotlinx.coroutines.experimental.*
import kotlinx.coroutines.experimental.channels.Channel
import kotlinx.coroutines.*
import kotlinx.coroutines.channels.Channel
import org.apache.logging.log4j.LogManager
import org.apache.logging.log4j.Logger
import java.time.LocalDateTime
Expand All @@ -39,12 +39,12 @@ class ScheduledQueue(private val queues: List<KrawlQueueIf>,

init {
repeat(queues.size) {
launch(CommonPool + jobContext) {
GlobalScope.launch(Dispatchers.Default) {
pop(it)
}
}
}
}

private var pushSelector: Int = 0

private val pushAffinityCache: LoadingCache<String, Int> = CacheBuilder.newBuilder()
Expand Down Expand Up @@ -84,13 +84,13 @@ class ScheduledQueue(private val queues: List<KrawlQueueIf>,
logger.debug("Popping w/ queue selector: $index")

var entry: KrawlQueueEntry? = queues[index].pop()

while (entry == null) {
logger.debug("Delaying queue:$index for 1000...")
delay(1000)
entry = queues[index].pop()
}

krawlQueueEntryChannel.send(entry)
}
}
Expand Down
27 changes: 15 additions & 12 deletions src/main/kotlin/io/thelandscape/krawler/crawler/Krawler.kt
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,11 @@ import io.thelandscape.krawler.http.*
import io.thelandscape.krawler.robots.RoboMinder
import io.thelandscape.krawler.robots.RoboMinderIf
import io.thelandscape.krawler.robots.RobotsConfig
import kotlinx.coroutines.experimental.*
import kotlinx.coroutines.experimental.channels.*
import kotlinx.coroutines.*
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.channels.ReceiveChannel
import kotlinx.coroutines.channels.consumeEach
import kotlinx.coroutines.channels.produce
import org.apache.logging.log4j.LogManager
import org.apache.logging.log4j.Logger
import java.time.LocalDateTime
Expand Down Expand Up @@ -239,7 +242,7 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
* @param: blocking [Boolean]: (default true) whether to block until completion or immediately return
*
*/
fun start(seedUrl: List<String>, blocking: Boolean = true) = runBlocking(CommonPool) {
fun start(seedUrl: List<String>, blocking: Boolean = true) = runBlocking {
// Convert all URLs to KrawlUrls
val krawlUrls: List<KrawlUrl> = seedUrl.map { KrawlUrl.new(it) }

Expand All @@ -252,8 +255,8 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
onCrawlStart()
val urls: Channel<KrawlQueueEntry> = scheduledQueue.krawlQueueEntryChannel
repeat(krawlQueues!!.size) {
launch(CommonPool + job) {
val actions: ProducerJob<KrawlAction> = produceKrawlActions(urls)
GlobalScope.launch(Dispatchers.Default) {
val actions: ReceiveChannel<KrawlAction> = produceKrawlActions(urls)
doCrawl(actions)
}
}
Expand Down Expand Up @@ -319,8 +322,8 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),

internal val visitCount: AtomicInteger = AtomicInteger(0)

internal suspend fun produceKrawlActions(entries: ReceiveChannel<KrawlQueueEntry>): ProducerJob<KrawlAction>
= produce(CommonPool + job) {
internal suspend fun produceKrawlActions(entries: ReceiveChannel<KrawlQueueEntry>): ReceiveChannel<KrawlAction>
= GlobalScope.produce(Dispatchers.Default) {

while(true) {
// This is where we'll die bomb out if we don't receive an entry after some time
Expand All @@ -334,7 +337,7 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
}
delay(1000)
}

val (url, root, parent, depth) = entries.receive()

val krawlUrl: KrawlUrl = KrawlUrl.new(url)
Expand All @@ -349,13 +352,13 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
return@produce
}
}

send(action)
}
}

internal fun fetch(krawlUrl: KrawlUrl, depth: Int, parent: KrawlUrl, rootPageId: Int): Deferred<KrawlAction>
= async(CommonPool + job) {
= GlobalScope.async(Dispatchers.Default) {

// Make sure we're within depth limit
if (depth >= config.maxDepth && config.maxDepth != -1) {
Expand Down Expand Up @@ -422,9 +425,9 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(),
channel.consumeEach { action ->
when(action) {
is KrawlAction.Visit ->
async(CommonPool + job) { visit(action.krawlUrl, action.doc) }.await()
withContext(Dispatchers.Default) { visit(action.krawlUrl, action.doc) }
is KrawlAction.Check ->
async(CommonPool + job) { check(action.krawlUrl, action.statusCode) }.await()
withContext(Dispatchers.Default) { check(action.krawlUrl, action.statusCode) }
}
}
}
Expand Down
11 changes: 4 additions & 7 deletions src/main/kotlin/io/thelandscape/krawler/http/Requests.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,8 @@ package io.thelandscape.krawler.http

import io.thelandscape.krawler.crawler.KrawlConfig
import io.thelandscape.krawler.robots.RobotsTxt
import kotlinx.coroutines.experimental.CommonPool
import kotlinx.coroutines.experimental.Deferred
import kotlinx.coroutines.experimental.async
import kotlinx.coroutines.experimental.delay
import kotlinx.coroutines.experimental.sync.Mutex
import kotlinx.coroutines.*
import kotlinx.coroutines.sync.Mutex
import org.apache.http.HttpRequest
import org.apache.http.HttpResponse
import org.apache.http.client.config.CookieSpecs
Expand Down Expand Up @@ -161,7 +158,7 @@ class Requests(private val krawlConfig: KrawlConfig,
private fun asyncMakeRequest(url: KrawlUrl,
reqFun: (String) -> HttpUriRequest,
retFun: (KrawlUrl, HttpResponse, HttpClientContext) -> RequestResponse)
: Deferred<RequestResponse> = async(CommonPool) {
: Deferred<RequestResponse> = GlobalScope.async(Dispatchers.Default) {

val httpContext = HttpClientContext()
httpContext.setAttribute("fullRedirectHistory", listOf<RedirectHistoryNode>())
Expand All @@ -178,7 +175,7 @@ class Requests(private val krawlConfig: KrawlConfig,
val reqDelta = Instant.now().toEpochMilli() - requestTracker.getTimestamp(host)
if (reqDelta >= 0 && reqDelta < krawlConfig.politenessDelay) {
// Sleep until the remainder of the politeness delay has elapsed
logger.debug("Sleeping for ${krawlConfig.politenessDelay - reqDelta} ms for politeness.")
logger.debug("Sleeping for ${krawlConfig.politenessDelay - reqDelta} ms for politeness.")
delay(krawlConfig.politenessDelay - reqDelta)
}
// Set last request time for politeness
Expand Down

0 comments on commit f6edb2d

Please sign in to comment.