Easy-to-use Kotlin extensions for parallel document processing using coroutines.
suspend fun loadDocument() {
    // Load a single text document from disk and print its content.
    val parser = TextDocumentParser()
    val document = loadAsync(FileSystemSource(Paths.get("path/to/document.txt")), parser)
    println(document.text())
}
suspend fun loadDocuments() {
    try {
        // Collect every regular file under ./data
        val paths =
            Files
                .walk(Paths.get("./data"))
                .filter(Files::isRegularFile)
                .toList()

        // Cap concurrent loads at 8 — limitedParallelism returns a
        // CoroutineDispatcher, so name it as one (not a "scope")
        val ioDispatcher = Dispatchers.IO.limitedParallelism(8)
        val documentParser = TextDocumentParser()

        // async requires a CoroutineScope receiver; coroutineScope ties the
        // child coroutines to this suspend function (structured concurrency)
        // and was missing in the original, which would not compile.
        val documents =
            coroutineScope {
                paths
                    .map { path ->
                        async {
                            try {
                                loadAsync(
                                    source = FileSystemSource(path),
                                    parser = documentParser,
                                    dispatcher = ioDispatcher,
                                )
                            } catch (e: kotlinx.coroutines.CancellationException) {
                                // Never swallow cancellation — rethrow so the
                                // scope can cancel cleanly.
                                throw e
                            } catch (e: Exception) {
                                // Isolate failures: one bad file must not stop the batch.
                                logger.error("Failed to load document: $path", e)
                                null
                            }
                        }
                    }.awaitAll()
            }.filterNotNull()

        // Process loaded documents
        documents.forEach { doc -> println(doc.text()) }
    } catch (e: Exception) {
        logger.error("Failed to process documents", e)
        throw e
    }
}
suspend fun parseInputStream(input: InputStream) {
    // Parse the stream into a document and print its text.
    // `use` guarantees the stream is closed even if parsing throws.
    val parser = TextDocumentParser()
    input.use { stream ->
        val parsed = parser.parseAsync(stream)
        println(parsed.text())
    }
}
All operations run on `Dispatchers.IO` for optimal I/O performance.
- Each document load operation is isolated - if one fails, others continue
- Use try-catch blocks around individual operations to handle failures gracefully
- Always close resources using `use` (Kotlin's equivalent of try-with-resources)
- Log errors for failed operations while allowing successful ones to proceed
For large directories, process files in batches to control memory usage. A batch size of 100-1000 documents is recommended, depending on document size.
Release document references when they are no longer needed. Consider using a lazy sequence for large file sets: `Files.walk().asSequence()`.
Limit parallel operations based on available resources. Use `limitedParallelism` to bound I/O concurrency:
val ioScope = Dispatchers.IO.limitedParallelism(8)
Stream large files instead of loading them fully into memory, and consider chunking large documents.
Uses `Dispatchers.IO` for optimal I/O throughput.