Various updates towards v0.3.0 #23

Merged · 9 commits · Nov 16, 2024
Changes from all commits
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM golang:1.19-alpine as builder
FROM golang:1.23-alpine AS builder

RUN apk add --update --no-cache git gcc musl-dev make

@@ -10,7 +10,7 @@ RUN make build-static \
&& mkdir -p /opt/bin \
&& mv ./crowlet /opt/bin/crowlet

FROM golang:1.19-alpine
FROM golang:1.23-alpine

COPY --from=builder /opt/bin/crowlet /opt/bin/crowlet

10 changes: 8 additions & 2 deletions Makefile
@@ -1,7 +1,8 @@
IMAGE_NAME = crowlet
IMAGE_VERSION = latest
IMAGE_VERSION = 0.3.0
IMAGE_ORG = aleravat
IMAGE_TAG = $(IMAGE_ORG)/$(IMAGE_NAME):$(IMAGE_VERSION)
IMAGE_TAG_LATEST = $(IMAGE_ORG)/$(IMAGE_NAME):latest

.DEFAULT_GOAL := build

@@ -20,7 +21,10 @@ build-static:: install-deps ## Builds a static binary
cmd/crowlet/crowlet.go

test:: ## Run tests
@cd test && go test
@cd pkg/crawler && go test

benchmark:: ## Run benchmarks
@cd cmd/crowlet && go test -bench=. -benchtime=30x -benchmem

install:: ## Build and install crowlet locally
@cd cmd/crowlet/ && go install .
@@ -38,7 +42,9 @@ docker-build:: ## Builds the docker image

docker-push:: ## Pushes the docker image to the registry
@echo Pushing $(IMAGE_TAG)
@docker image tag $(IMAGE_TAG) $(IMAGE_TAG_LATEST)
@docker push $(IMAGE_TAG)
@docker push $(IMAGE_TAG_LATEST)

docker-release:: docker-build docker-push ## Builds and pushes the docker image to the registry

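The new benchmark target leans on Go's standard benchmarking harness: `-bench=.` runs every function whose name matches `BenchmarkXxx`, `-benchtime=30x` pins the loop to exactly 30 iterations, and `-benchmem` adds allocation statistics to the report. A minimal sketch of the kind of function this target discovers (the name and body are illustrative, not taken from the repository):

```go
package main

import "testing"

// BenchmarkExample is picked up by `go test -bench=.`; with -benchtime=30x the
// loop body runs exactly 30 times, and -benchmem (or b.ReportAllocs) adds
// B/op and allocs/op to the reported ns/op.
func BenchmarkExample(b *testing.B) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		_ = make([]byte, 1024) // the work being measured
	}
}
```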
6 changes: 3 additions & 3 deletions cmd/crowlet/crowlet.go
@@ -15,7 +15,7 @@ import (

var (
// VERSION stores the current version as string
VERSION = "v0.2.1"
VERSION = "v0.3.0"
)

func beforeApp(c *cli.Context) error {
@@ -170,7 +170,7 @@ func main() {

func addInterruptHandlers() chan struct{} {
stop := make(chan struct{})
osSignal := make(chan os.Signal)
osSignal := make(chan os.Signal, 1)
signal.Notify(osSignal, os.Interrupt, syscall.SIGTERM)
signal.Notify(osSignal, os.Interrupt, syscall.SIGINT)

@@ -230,7 +230,7 @@ func start(c *cli.Context) error {
HTTPGetter: &crawler.BaseConcurrentHTTPGetter{
Get: crawler.HTTPGet,
},
Links: crawler.CrawlLinksConfig{
Links: crawler.CrawlPageLinksConfig{
CrawlExternalLinks: c.Bool("crawl-external"),
CrawlImages: c.Bool("crawl-images"),
CrawlHyperlinks: c.Bool("crawl-hyperlinks"),
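Besides the version bump and the rename of CrawlLinksConfig to CrawlPageLinksConfig, the interrupt handler now uses a buffered channel. The os/signal documentation requires this: signal.Notify does not block when delivering, so a signal arriving before the receiver is ready is silently dropped on an unbuffered channel (go vet's sigchanyzer check flags unbuffered channels passed to signal.Notify). A minimal sketch of the pattern outside crowlet:

```go
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	// Buffer of 1 so a signal delivered before the receive below is not lost.
	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, os.Interrupt, syscall.SIGTERM)

	fmt.Println("waiting for SIGINT or SIGTERM")
	s := <-sigs
	fmt.Printf("received %v, shutting down\n", s)
}
```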
110 changes: 110 additions & 0 deletions cmd/crowlet/crowlet_test.go
@@ -0,0 +1,110 @@
package main

import (
"flag"
"fmt"
"net/http"
"net/http/httptest"
"strings"
"testing"

"github.com/urfave/cli"
)

func BuildSitemap(host string, count int) string {
sitemap := `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">`
for i := 1; i <= count; i++ {
if i%10 == 0 {
sitemap += fmt.Sprintf("<url><loc>%s/error%d</loc></url>", host, i)
} else {
sitemap += fmt.Sprintf("<url><loc>%s/page%d</loc></url>", host, i)
}
}
sitemap += "</urlset>"
return sitemap
}

func BuildPageContent(size int) string {
content := `
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test Page</title>
</head>
<body>
<h1>Test Page</h1>
<p>This is a test page with repeated content to reach approximately 2KB in size.</p>
`
paragraph := `<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus lacinia odio vitae vestibulum vestibulum. Cras venenatis euismod malesuada.</p>`

// Append paragraphs until the content reaches the desired size
for len(content) < size {
content += paragraph
}
content += `
</body>
</html>`
return content
}

func BenchmarkStartFunction(b *testing.B) {
pageContent := BuildPageContent(150000)
var sitemapXML string

// Set up a mock HTTP server to simulate sitemap and page responses
mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/sitemap.xml" {
w.Header().Set("Content-Type", "application/xml")
w.WriteHeader(http.StatusOK)
w.Write([]byte(sitemapXML))
} else if strings.HasPrefix(r.URL.Path, "/page") {
w.WriteHeader(http.StatusOK)
w.Write([]byte(pageContent))
} else {
w.WriteHeader(http.StatusNotFound)
}
}))
sitemapXML = BuildSitemap(mockServer.URL, 1000)

defer mockServer.Close()

// Set up CLI flags and arguments for the context
app := cli.NewApp()
set := flag.NewFlagSet("test", 0)
set.Int("throttle", 3, "number of http requests to do at once")
set.String("override-host", "", "override hostname")
set.String("user", "", "username for basic auth")
set.String("pass", "", "password for basic auth")
set.Int("timeout", 1000, "timeout in milliseconds")
set.Bool("crawl-external", false, "crawl external links")
set.Bool("crawl-images", false, "crawl images")
set.Bool("crawl-hyperlinks", true, "crawl hyperlinks")
set.Int("iterations", 1, "number of crawl iterations")
set.Bool("forever", false, "crawl forever")
set.Int("wait-interval", 0, "wait interval between iterations")
set.Bool("quiet", true, "suppress output")
set.Bool("json", false, "json output")
set.Int("non-200-error", 1, "error code for non-200 responses")
set.Int("response-time-error", 2, "error code for max response time exceeded")
set.Int("response-time-max", 0, "max response time in milliseconds")
set.Bool("summary-only", false, "only print summary")

// Add sitemap URL as the argument
set.Parse([]string{mockServer.URL + "/sitemap.xml"})

// Create context with flags and args
ctx := cli.NewContext(app, set, nil)

// Start the benchmark test
b.ResetTimer() // Reset the timer to measure only the time spent in the loop
b.ReportAllocs() // Report memory allocations per operation

for i := 0; i < b.N; i++ {
err := start(ctx)
if err != nil {
b.Fatalf("start function failed: %v", err)
}
}
}
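The benchmark builds a cli.Context by hand rather than going through app.Run, registering a flag.FlagSet with the same flag names the real command defines and then parsing the sitemap URL as a positional argument. A stripped-down sketch of that technique with urfave/cli v1 (the flag names here are illustrative):

```go
package main

import (
	"flag"
	"fmt"

	"github.com/urfave/cli"
)

func main() {
	app := cli.NewApp()

	// Register flags with the defaults the action under test expects to read.
	set := flag.NewFlagSet("test", 0)
	set.Int("throttle", 3, "concurrent requests")
	set.Bool("quiet", true, "suppress output")

	// Anything left after flag parsing becomes ctx.Args().
	_ = set.Parse([]string{"http://example.com/sitemap.xml"})

	ctx := cli.NewContext(app, set, nil)
	fmt.Println(ctx.Int("throttle"), ctx.Bool("quiet"), ctx.Args().First())
}
```

This keeps the benchmark focused on the start action itself, while the surrounding httptest server supplies deterministic sitemap and page responses without real network I/O.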
68 changes: 27 additions & 41 deletions pkg/crawler/crawl.go
@@ -32,12 +32,12 @@ type CrawlConfig struct {
Throttle int
Host string
HTTP HTTPConfig
Links CrawlLinksConfig
Links CrawlPageLinksConfig
HTTPGetter ConcurrentHTTPGetter
}

// CrawlLinksConfig holds the crawling policy for links
type CrawlLinksConfig struct {
// CrawlPageLinksConfig holds the crawling policy for links
type CrawlPageLinksConfig struct {
CrawlExternalLinks bool
CrawlHyperlinks bool
CrawlImages bool
@@ -122,20 +122,18 @@ func AsyncCrawl(urls []string, config CrawlConfig, quit <-chan struct{}) (stats
log.Warn("Invalid throttle value, defaulting to 1.")
config.Throttle = 1
}
if config.Host != "" {
urls = RewriteURLHost(urls, config.Host)
}

config.HTTP.ParseLinks = config.Links.CrawlExternalLinks || config.Links.CrawlHyperlinks ||
config.Links.CrawlImages
results, stats, server200TimeSum := crawlUrls(urls, config, quit)

select {
case <-quit:
break
default:
if config.HTTP.ParseLinks {
_, linksStats, linksServer200TimeSum := crawlLinks(results, urls, config, quit)
stats = MergeCrawlStats(stats, linksStats)
server200TimeSum += linksServer200TimeSum
}
if config.HTTP.ParseLinks {
_, pageLinksStats, linksServer200TimeSum := crawlPageLinks(results, config, quit)
stats = MergeCrawlStats(stats, pageLinksStats)
server200TimeSum += linksServer200TimeSum
}

total200 := stats.StatusCodes[200]
@@ -144,48 +142,41 @@ func AsyncCrawl(urls []string, config CrawlConfig, quit <-chan struct{}) (stats
}

if stats.Total == 0 {
err = errors.New("No URL crawled")
err = errors.New("no URL crawled")
} else if stats.Total != stats.StatusCodes[200] {
err = errors.New("Some URLs had a different status code than 200")
err = errors.New("some URLs had a different status code than 200")
}

return
}

func crawlLinks(sourceResults []HTTPResponse, sourceURLs []string, sourceConfig CrawlConfig, quit <-chan struct{}) ([]HTTPResponse,
func crawlPageLinks(sourceResults map[string]*HTTPResponse, sourceConfig CrawlConfig, quit <-chan struct{}) (map[string]*HTTPResponse,
CrawlStats, time.Duration) {

linkedUrlsSet := make(map[string][]string)
for _, result := range sourceResults {
for _, link := range result.Links {
if link.IsExternal && !sourceConfig.Links.CrawlExternalLinks {
if (!sourceConfig.Links.CrawlExternalLinks && link.IsExternal) ||
(!sourceConfig.Links.CrawlHyperlinks && link.Type == Hyperlink) ||
(!sourceConfig.Links.CrawlImages && link.Type == Image) {
continue
}

if link.Type == Hyperlink && !sourceConfig.Links.CrawlHyperlinks {
continue
}

if link.Type == Image && !sourceConfig.Links.CrawlImages {
// Skip if already present in sourceResults
if _, ok := sourceResults[link.TargetURL.String()]; ok {
continue
}

linkedUrlsSet[link.TargetURL.String()] = append(linkedUrlsSet[link.TargetURL.String()], result.URL)
}
}

for _, alreadyCrawledURL := range sourceURLs {
delete(linkedUrlsSet, alreadyCrawledURL)
}

linkedUrls := make([]string, 0, len(linkedUrlsSet))
for url := range linkedUrlsSet {
linkedUrls = append(linkedUrls, url)
}

// Make exploration non-recursive by not collecting any more links.
linksConfig := sourceConfig
linksConfig.HTTP.ParseLinks = false
linksConfig.Links = CrawlLinksConfig{
linksConfig.Links = CrawlPageLinksConfig{
CrawlExternalLinks: false,
CrawlImages: false,
CrawlHyperlinks: false}
@@ -201,25 +192,20 @@ func crawlLinks(sourceResults []HTTPResponse, sourceURLs []string, sourceConfig
return linksResults, linksStats, linksServer200TimeSum
}

func crawlUrls(urls []string, config CrawlConfig, quit <-chan struct{}) (results []HTTPResponse,
func crawlUrls(urls []string, config CrawlConfig, quit <-chan struct{}) (results map[string]*HTTPResponse,
stats CrawlStats, server200TimeSum time.Duration) {

results = make(map[string]*HTTPResponse)
stats.StatusCodes = make(map[int]int)
resultsChan := config.HTTPGetter.ConcurrentHTTPGet(urls, config.HTTP, config.Throttle, quit)
for {
select {
case result, channelOpen := <-resultsChan:
if !channelOpen {
return
}

updateCrawlStats(result, &stats, &server200TimeSum)
results = append(results, *result)
}
for result := range resultsChan {
populateCrawlStats(result, &stats, &server200TimeSum)
results[result.URL] = result
}
return
}

func updateCrawlStats(result *HTTPResponse, stats *CrawlStats, total200Time *time.Duration) {
func populateCrawlStats(result *HTTPResponse, stats *CrawlStats, total200Time *time.Duration) {
stats.Total++

statusCode := result.StatusCode
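The reworked crawlUrls collects results into a map[string]*HTTPResponse keyed by URL, which lets crawlPageLinks skip any link that was already fetched with a single map lookup instead of deleting previously crawled URLs from the set afterwards; ranging over the results channel also replaces the manual select loop, since a range over a channel ends once the sender closes it. A self-contained sketch of that collect-then-dedup shape, with a placeholder type standing in for crowlet's HTTPResponse:

```go
package main

import "fmt"

// response is a stand-in for crowlet's HTTPResponse.
type response struct {
	URL   string
	Links []string
}

// collect drains the channel into a map keyed by URL; the range loop ends
// once the producer closes the channel.
func collect(ch <-chan *response) map[string]*response {
	results := make(map[string]*response)
	for r := range ch {
		results[r.URL] = r
	}
	return results
}

// pendingLinks returns linked URLs that have not been crawled yet.
func pendingLinks(results map[string]*response) []string {
	var todo []string
	queued := make(map[string]bool)
	for _, r := range results {
		for _, link := range r.Links {
			if _, crawled := results[link]; crawled || queued[link] {
				continue // already fetched, or already queued for the next pass
			}
			queued[link] = true
			todo = append(todo, link)
		}
	}
	return todo
}

func main() {
	ch := make(chan *response, 2)
	ch <- &response{URL: "/a", Links: []string{"/b", "/a"}}
	ch <- &response{URL: "/b", Links: []string{"/c"}}
	close(ch)
	fmt.Println(pendingLinks(collect(ch))) // [/c]
}
```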
8 changes: 3 additions & 5 deletions test/crawl_test.go → pkg/crawler/crawl_test.go
@@ -3,26 +3,24 @@ package crawler
import (
"testing"
"time"

"github.com/Pixep/crowlet/pkg/crawler"
)

func TestMergeCrawlStats(t *testing.T) {
statsA := crawler.CrawlStats{
statsA := CrawlStats{
Total: 10,
StatusCodes: map[int]int{200: 10},
Average200Time: time.Duration(1) * time.Second,
Max200Time: time.Duration(2) * time.Second,
}

statsB := crawler.CrawlStats{
statsB := CrawlStats{
Total: 6,
StatusCodes: map[int]int{200: 2, 404: 4},
Average200Time: time.Duration(7) * time.Second,
Max200Time: time.Duration(9) * time.Second,
}

stats := crawler.MergeCrawlStats(statsA, statsB)
stats := MergeCrawlStats(statsA, statsB)

if stats.Total != 16 {
t.Fatal("Invalid total", stats.Total)
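Relocating the file from test/ into pkg/crawler turns it into an internal test: it now declares package crawler, so MergeCrawlStats and CrawlStats are reachable without the github.com/Pixep/crowlet/pkg/crawler import or the crawler. qualifier. A minimal sketch of what an internal test in that package looks like (file name and test body are illustrative):

```go
// pkg/crawler/example_internal_test.go (illustrative file name)
package crawler

import "testing"

// TestInternalAccess uses package identifiers directly, with no import of the
// crawler package and no "crawler." prefix.
func TestInternalAccess(t *testing.T) {
	stats := CrawlStats{Total: 1, StatusCodes: map[int]int{200: 1}}
	if stats.Total != 1 || stats.StatusCodes[200] != 1 {
		t.Fatal("unexpected stats", stats)
	}
}
```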