From c198e4016097656d7f53e01973505af23d4f25a7 Mon Sep 17 00:00:00 2001 From: Adrien Leravat Date: Sun, 10 Nov 2024 15:33:30 -0800 Subject: [PATCH 1/9] crawler: crawl: store result in a hash map, minor improvements --- cmd/crowlet/crowlet.go | 2 +- pkg/crawler/crawl.go | 65 ++++++++++++++++-------------------------- 2 files changed, 25 insertions(+), 42 deletions(-) diff --git a/cmd/crowlet/crowlet.go b/cmd/crowlet/crowlet.go index 68d489e..4ac89c4 100644 --- a/cmd/crowlet/crowlet.go +++ b/cmd/crowlet/crowlet.go @@ -230,7 +230,7 @@ func start(c *cli.Context) error { HTTPGetter: &crawler.BaseConcurrentHTTPGetter{ Get: crawler.HTTPGet, }, - Links: crawler.CrawlLinksConfig{ + Links: crawler.CrawlPageLinksConfig{ CrawlExternalLinks: c.Bool("crawl-external"), CrawlImages: c.Bool("crawl-images"), CrawlHyperlinks: c.Bool("crawl-hyperlinks"), diff --git a/pkg/crawler/crawl.go b/pkg/crawler/crawl.go index 0be12e1..a0e28e0 100644 --- a/pkg/crawler/crawl.go +++ b/pkg/crawler/crawl.go @@ -32,12 +32,12 @@ type CrawlConfig struct { Throttle int Host string HTTP HTTPConfig - Links CrawlLinksConfig + Links CrawlPageLinksConfig HTTPGetter ConcurrentHTTPGetter } -// CrawlLinksConfig holds the crawling policy for links -type CrawlLinksConfig struct { +// CrawlPageLinksConfig holds the crawling policy for links +type CrawlPageLinksConfig struct { CrawlExternalLinks bool CrawlHyperlinks bool CrawlImages bool @@ -127,15 +127,10 @@ func AsyncCrawl(urls []string, config CrawlConfig, quit <-chan struct{}) (stats config.Links.CrawlImages results, stats, server200TimeSum := crawlUrls(urls, config, quit) - select { - case <-quit: - break - default: - if config.HTTP.ParseLinks { - _, linksStats, linksServer200TimeSum := crawlLinks(results, urls, config, quit) - stats = MergeCrawlStats(stats, linksStats) - server200TimeSum += linksServer200TimeSum - } + if config.HTTP.ParseLinks { + _, pageLinksStats, linksServer200TimeSum := crawlPageLinks(results, config, quit) + stats = MergeCrawlStats(stats, pageLinksStats) + server200TimeSum += linksServer200TimeSum } total200 := stats.StatusCodes[200] @@ -144,48 +139,41 @@ func AsyncCrawl(urls []string, config CrawlConfig, quit <-chan struct{}) (stats } if stats.Total == 0 { - err = errors.New("No URL crawled") + err = errors.New("no URL crawled") } else if stats.Total != stats.StatusCodes[200] { - err = errors.New("Some URLs had a different status code than 200") + err = errors.New("some URLs had a different status code than 200") } return } -func crawlLinks(sourceResults []HTTPResponse, sourceURLs []string, sourceConfig CrawlConfig, quit <-chan struct{}) ([]HTTPResponse, +func crawlPageLinks(sourceResults map[string]*HTTPResponse, sourceConfig CrawlConfig, quit <-chan struct{}) (map[string]*HTTPResponse, CrawlStats, time.Duration) { - linkedUrlsSet := make(map[string][]string) for _, result := range sourceResults { for _, link := range result.Links { - if link.IsExternal && !sourceConfig.Links.CrawlExternalLinks { + if (!sourceConfig.Links.CrawlExternalLinks && link.IsExternal) || + (!sourceConfig.Links.CrawlHyperlinks && link.Type == Hyperlink) || + (!sourceConfig.Links.CrawlImages && link.Type == Image) { continue } - - if link.Type == Hyperlink && !sourceConfig.Links.CrawlHyperlinks { + // Skip if already present in sourceResults + if _, ok := sourceResults[link.TargetURL.String()]; ok { continue } - - if link.Type == Image && !sourceConfig.Links.CrawlImages { - continue - } - linkedUrlsSet[link.TargetURL.String()] = append(linkedUrlsSet[link.TargetURL.String()], result.URL) 
} } - for _, alreadyCrawledURL := range sourceURLs { - delete(linkedUrlsSet, alreadyCrawledURL) - } - linkedUrls := make([]string, 0, len(linkedUrlsSet)) for url := range linkedUrlsSet { linkedUrls = append(linkedUrls, url) } + // Make exploration non-recursive by not collecting any more links. linksConfig := sourceConfig linksConfig.HTTP.ParseLinks = false - linksConfig.Links = CrawlLinksConfig{ + linksConfig.Links = CrawlPageLinksConfig{ CrawlExternalLinks: false, CrawlImages: false, CrawlHyperlinks: false} @@ -201,25 +189,20 @@ func crawlLinks(sourceResults []HTTPResponse, sourceURLs []string, sourceConfig return linksResults, linksStats, linksServer200TimeSum } -func crawlUrls(urls []string, config CrawlConfig, quit <-chan struct{}) (results []HTTPResponse, +func crawlUrls(urls []string, config CrawlConfig, quit <-chan struct{}) (results map[string]*HTTPResponse, stats CrawlStats, server200TimeSum time.Duration) { + results = make(map[string]*HTTPResponse) stats.StatusCodes = make(map[int]int) resultsChan := config.HTTPGetter.ConcurrentHTTPGet(urls, config.HTTP, config.Throttle, quit) - for { - select { - case result, channelOpen := <-resultsChan: - if !channelOpen { - return - } - - updateCrawlStats(result, &stats, &server200TimeSum) - results = append(results, *result) - } + for result := range resultsChan { + populateCrawlStats(result, &stats, &server200TimeSum) + results[result.URL] = result } + return } -func updateCrawlStats(result *HTTPResponse, stats *CrawlStats, total200Time *time.Duration) { +func populateCrawlStats(result *HTTPResponse, stats *CrawlStats, total200Time *time.Duration) { stats.Total++ statusCode := result.StatusCode From 28703814e49899f2f3d7b09e87e04127406737fa Mon Sep 17 00:00:00 2001 From: Adrien Leravat Date: Sun, 10 Nov 2024 15:35:42 -0800 Subject: [PATCH 2/9] crawler: http: recycle http.Client among workers --- pkg/crawler/http.go | 21 +++++++++++++-------- test/http_test.go | 3 ++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pkg/crawler/http.go b/pkg/crawler/http.go index 1546973..11749a2 100644 --- a/pkg/crawler/http.go +++ b/pkg/crawler/http.go @@ -33,7 +33,7 @@ type HTTPConfig struct { // HTTPGetter performs a single HTTP/S to the url, and return information // related to the result as an HTTPResponse -type HTTPGetter func(url string, config HTTPConfig) (response *HTTPResponse) +type HTTPGetter func(client *http.Client, url string, config HTTPConfig) (response *HTTPResponse) func createRequest(url string) (*http.Request, *httpstat.Result, error) { req, err := http.NewRequest("GET", url, nil) @@ -57,7 +57,7 @@ func configureRequest(req *http.Request, config HTTPConfig) { } // HTTPGet issues a GET request to a single URL and returns an HTTPResponse -func HTTPGet(urlStr string, config HTTPConfig) (response *HTTPResponse) { +func HTTPGet(client *http.Client, urlStr string, config HTTPConfig) (response *HTTPResponse) { response = &HTTPResponse{ URL: urlStr, } @@ -142,8 +142,13 @@ func (getter *BaseConcurrentHTTPGetter) ConcurrentHTTPGet(urls []string, config func RunConcurrentGet(httpGet HTTPGetter, urls []string, config HTTPConfig, maxConcurrent int, resultChan chan<- *HTTPResponse, quit <-chan struct{}) { - httpResources := make(chan int, maxConcurrent) var wg sync.WaitGroup + clientsReady := make(chan *http.Client, maxConcurrent) + for i := 0; i < maxConcurrent; i++ { + clientsReady <- &http.Client{ + Timeout: config.Timeout, + } + } defer func() { wg.Wait() @@ -155,17 +160,17 @@ func RunConcurrentGet(httpGet HTTPGetter, urls 
[]string, config HTTPConfig, case <-quit: log.Info("Waiting for workers to finish...") return - case httpResources <- 1: + case client := <-clientsReady: wg.Add(1) - go func(url string) { + go func(client *http.Client, url string) { defer func() { - <-httpResources + clientsReady <- client wg.Done() }() - resultChan <- httpGet(url, config) - }(url) + resultChan <- httpGet(client, url, config) + }(client, url) } } } diff --git a/test/http_test.go b/test/http_test.go index b299fd4..13882d5 100644 --- a/test/http_test.go +++ b/test/http_test.go @@ -1,6 +1,7 @@ package crawler import ( + "net/http" "sort" "sync" "testing" @@ -49,7 +50,7 @@ func TestRunConcurrentGet(t *testing.T) { } } -func mockHTTPGet(url string, config crawler.HTTPConfig) *crawler.HTTPResponse { +func mockHTTPGet(client *http.Client, url string, config crawler.HTTPConfig) *crawler.HTTPResponse { fetchedUrls = append(fetchedUrls, url) waitMutex.Lock() waitMutex.Unlock() From 48d7d45d09f500a34b41ae3a54cdf0bc8a02572c Mon Sep 17 00:00:00 2001 From: Adrien Leravat Date: Sun, 10 Nov 2024 15:16:49 -0800 Subject: [PATCH 3/9] crawler: restore support for '--override-host' When '--override-host ' is provided, the host in links of the sitemap will be replaced by the one passed. This only affects links at the first level of the sitemap. Fixes #18 --- cmd/crowlet/crowlet.go | 2 +- pkg/crawler/crawl.go | 3 +++ pkg/crawler/links.go | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/cmd/crowlet/crowlet.go b/cmd/crowlet/crowlet.go index 4ac89c4..a3d53c2 100644 --- a/cmd/crowlet/crowlet.go +++ b/cmd/crowlet/crowlet.go @@ -170,7 +170,7 @@ func main() { func addInterruptHandlers() chan struct{} { stop := make(chan struct{}) - osSignal := make(chan os.Signal) + osSignal := make(chan os.Signal, 1) signal.Notify(osSignal, os.Interrupt, syscall.SIGTERM) signal.Notify(osSignal, os.Interrupt, syscall.SIGINT) diff --git a/pkg/crawler/crawl.go b/pkg/crawler/crawl.go index a0e28e0..e63509b 100644 --- a/pkg/crawler/crawl.go +++ b/pkg/crawler/crawl.go @@ -122,6 +122,9 @@ func AsyncCrawl(urls []string, config CrawlConfig, quit <-chan struct{}) (stats log.Warn("Invalid throttle value, defaulting to 1.") config.Throttle = 1 } + if config.Host != "" { + urls = RewriteURLHost(urls, config.Host) + } config.HTTP.ParseLinks = config.Links.CrawlExternalLinks || config.Links.CrawlHyperlinks || config.Links.CrawlImages diff --git a/pkg/crawler/links.go b/pkg/crawler/links.go index a0e5438..f67e7f6 100644 --- a/pkg/crawler/links.go +++ b/pkg/crawler/links.go @@ -27,6 +27,21 @@ type Link struct { IsExternal bool } +// RewriteURLHost modifies a list of raw URL strings to point to a new host. +func RewriteURLHost(urls []string, newHost string) []string { + rewrittenURLs := make([]string, 0, len(urls)) + for _, rawURL := range urls { + url, err := url.Parse(rawURL) + if err != nil { + log.Error("error parsing URL:", err) + continue + } + url.Host = newHost + rewrittenURLs = append(rewrittenURLs, url.String()) + } + return rewrittenURLs +} + // ExtractLinks returns links found in the html page provided and currentURL. 
// The URL is used to differentiate between internal and external links func ExtractLinks(htmlBody io.ReadCloser, currentURL url.URL) ([]Link, error) { From 82422b4e3829c578eedaba1990eb2e3251d55ae8 Mon Sep 17 00:00:00 2001 From: Adrien Leravat Date: Sun, 10 Nov 2024 15:37:22 -0800 Subject: [PATCH 4/9] crawler: http: Ensure HTTP response body is closed Previously, the body could stay open in specific conditons, causing increased memory usage. --- pkg/crawler/http.go | 27 +++++++++++++-------------- pkg/crawler/links.go | 7 ++----- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/pkg/crawler/http.go b/pkg/crawler/http.go index 11749a2..df1279f 100644 --- a/pkg/crawler/http.go +++ b/pkg/crawler/http.go @@ -2,7 +2,6 @@ package crawler import ( "io" - "io/ioutil" "net/http" "net/url" "sync" @@ -70,29 +69,28 @@ func HTTPGet(client *http.Client, urlStr string, config HTTPConfig) (response *H configureRequest(req, config) - client := http.Client{ - Timeout: config.Timeout, - } - resp, err := client.Do(req) response.EndTime = time.Now() response.Response = resp response.Result = result - if resp == nil { - response.StatusCode = 0 - } else { - response.StatusCode = response.Response.StatusCode - } - defer func() { - if resp != nil && !config.ParseLinks { - io.Copy(ioutil.Discard, resp.Body) + if resp != nil { + if !config.ParseLinks { + io.Copy(io.Discard, resp.Body) + } resp.Body.Close() } PrintResult(response) }() + if resp == nil { + response.StatusCode = 0 + } else { + response.StatusCode = response.Response.StatusCode + } + + // HTTP client error, won't trigger for 4xx or 5xx if err != nil { log.Error(err) response.Err = err @@ -102,12 +100,13 @@ func HTTPGet(client *http.Client, urlStr string, config HTTPConfig) (response *H if config.ParseLinks { currentURL, err := url.Parse(urlStr) if err != nil { + log.Error("error parsing base URL:", err) return } response.Links, err = ExtractLinks(resp.Body, *currentURL) - resp.Body.Close() if err != nil { + log.Error("error extracting page links:", err) return } } diff --git a/pkg/crawler/links.go b/pkg/crawler/links.go index f67e7f6..cf4c1ef 100644 --- a/pkg/crawler/links.go +++ b/pkg/crawler/links.go @@ -22,7 +22,6 @@ const ( // Link type holds information of URL links type Link struct { Type LinkType - Name string TargetURL url.URL IsExternal bool } @@ -109,11 +108,9 @@ func extractImageLinks(doc *goquery.Document) (links []Link) { func extractLink(urlString string) *Link { url, err := url.Parse(urlString) if err != nil { - log.Error(err) + log.Error("Failed to parse page link '", urlString, "':", err) return nil } - return &Link{ - Name: " link", //strings.TrimSpace(s.Text()), - TargetURL: *url} + return &Link{TargetURL: *url} } From 0521d4292267fadc7332fa8bf316ce717d1e4a4c Mon Sep 17 00:00:00 2001 From: Adrien Leravat Date: Sun, 10 Nov 2024 15:24:12 -0800 Subject: [PATCH 5/9] tests: move tests and add new tests --- Makefile | 2 +- {test => pkg/crawler}/crawl_test.go | 8 +- pkg/crawler/http_test.go | 210 ++++++++++++++++++++++++++++ pkg/crawler/links_test.go | 58 ++++++++ test/http_test.go | 79 ----------- 5 files changed, 272 insertions(+), 85 deletions(-) rename {test => pkg/crawler}/crawl_test.go (85%) create mode 100644 pkg/crawler/http_test.go create mode 100644 pkg/crawler/links_test.go delete mode 100644 test/http_test.go diff --git a/Makefile b/Makefile index 9d3f68f..f305875 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ build-static:: install-deps ## Builds a static binary cmd/crowlet/crowlet.go test:: ## Run tests - 
@cd test && go test + @cd pkg/crawler && go test install:: ## Build and install crowlet locally @cd cmd/crowlet/ && go install . diff --git a/test/crawl_test.go b/pkg/crawler/crawl_test.go similarity index 85% rename from test/crawl_test.go rename to pkg/crawler/crawl_test.go index cce4f6a..c646c8a 100644 --- a/test/crawl_test.go +++ b/pkg/crawler/crawl_test.go @@ -3,26 +3,24 @@ package crawler import ( "testing" "time" - - "github.com/Pixep/crowlet/pkg/crawler" ) func TestMergeCrawlStats(t *testing.T) { - statsA := crawler.CrawlStats{ + statsA := CrawlStats{ Total: 10, StatusCodes: map[int]int{200: 10}, Average200Time: time.Duration(1) * time.Second, Max200Time: time.Duration(2) * time.Second, } - statsB := crawler.CrawlStats{ + statsB := CrawlStats{ Total: 6, StatusCodes: map[int]int{200: 2, 404: 4}, Average200Time: time.Duration(7) * time.Second, Max200Time: time.Duration(9) * time.Second, } - stats := crawler.MergeCrawlStats(statsA, statsB) + stats := MergeCrawlStats(statsA, statsB) if stats.Total != 16 { t.Fatal("Invalid total", stats.Total) diff --git a/pkg/crawler/http_test.go b/pkg/crawler/http_test.go new file mode 100644 index 0000000..e9a5742 --- /dev/null +++ b/pkg/crawler/http_test.go @@ -0,0 +1,210 @@ +package crawler + +import ( + "net/http" + "net/http/httptest" + "net/url" + "sort" + "sync" + "testing" + "time" +) + +var waitMutex = &sync.Mutex{} +var resultMutex = &sync.Mutex{} +var fetchedUrls []string + +func TestRunConcurrentGet(t *testing.T) { + resultChan := make(chan *HTTPResponse) + quitChan := make(chan struct{}) + maxConcurrency := 3 + urls := []string{ + "url1", + "url2", + "url3", + "url4", + "url5", + } + + waitMutex.Lock() + go RunConcurrentGet(mockHTTPGet, urls, HTTPConfig{}, maxConcurrency, resultChan, quitChan) + time.Sleep(2 * time.Second) + + if len(fetchedUrls) != maxConcurrency { + t.Fatal("Incorrect channel length of", len(fetchedUrls)) + t.Fail() + } + + waitMutex.Unlock() + + resultChanOpen := true + for resultChanOpen == true { + _, resultChanOpen = <-resultChan + } + + sort.Strings(fetchedUrls) + if !testEq(fetchedUrls, urls) { + t.Fatal("Expected to crawl ", urls, " but crawled ", fetchedUrls, " instead.") + t.Fail() + } +} + +func mockHTTPGet(client *http.Client, url string, config HTTPConfig) *HTTPResponse { + resultMutex.Lock() + fetchedUrls = append(fetchedUrls, url) + resultMutex.Unlock() + waitMutex.Lock() + waitMutex.Unlock() + + return &HTTPResponse{URL: url} +} + +func testEq(a, b []string) bool { + + // If one is nil, the other must also be nil. 
+ if (a == nil) != (b == nil) { + return false + } + + if len(a) != len(b) { + return false + } + + for i := range a { + if a[i] != b[i] { + return false + } + } + + return true +} + +// TestHTTPGet tests the HTTPGet function +func TestHTTPGet(t *testing.T) { + tests := []struct { + name string + urlStr string + config HTTPConfig + statusCode int + body string + expectedLinks []Link + doNotServe bool + }{ + { + name: "Successful GET request without link parsing", + urlStr: "/test", + config: HTTPConfig{ParseLinks: false, Timeout: 5 * time.Second}, + statusCode: http.StatusOK, + body: ``, + expectedLinks: []Link{}, // No links expected since ParseLinks is false + doNotServe: false, + }, + { + name: "Successful GET request with link parsing", + urlStr: "/test", + config: HTTPConfig{ParseLinks: true, Timeout: 5 * time.Second}, + statusCode: http.StatusOK, + body: ``, + expectedLinks: []Link{ + { + Type: Hyperlink, + TargetURL: mustParseURL("https://example.com"), + IsExternal: true, + }, + { + Type: Image, + TargetURL: mustParseURL("https://example.com/image.png"), + IsExternal: true, + }, + }, + doNotServe: false, + }, + { + name: "Server error response", + urlStr: "/error", + config: HTTPConfig{ParseLinks: false, Timeout: 5 * time.Second}, + statusCode: http.StatusInternalServerError, + body: "Internal Server Error", + expectedLinks: []Link{}, // No links expected on error + doNotServe: false, + }, + { + name: "Server error response", + urlStr: "/error", + config: HTTPConfig{ParseLinks: false, Timeout: 500 * time.Millisecond}, + statusCode: 0, + body: "", + expectedLinks: []Link{}, // No links expected on error + doNotServe: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + urlStr := "http://localhost:61111" + if !tt.doNotServe { + // Setup mock HTTP server + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(tt.statusCode) + w.Write([]byte(tt.body)) + })) + urlStr = server.URL + tt.urlStr + defer server.Close() + } + + // Initialize client with a timeout + client := &http.Client{Timeout: tt.config.Timeout} + + // Call HTTPGet function + response := HTTPGet(client, urlStr, tt.config) + + // Validate error presence as expected + if (response.Err != nil) != tt.doNotServe { + t.Errorf("expected error: %v, got: %v", tt.doNotServe, response.Err) + } + + // Validate status code + if response.StatusCode != tt.statusCode { + t.Errorf("expected status code: %d, got: %d", tt.statusCode, response.StatusCode) + } + + // Validate parsed links if ParseLinks is enabled + if tt.config.ParseLinks { + if len(response.Links) != len(tt.expectedLinks) { + t.Errorf("expected %d links, got %d", len(tt.expectedLinks), len(response.Links)) + } + for i, link := range response.Links { + expectedLink := tt.expectedLinks[i] + if link.Type != expectedLink.Type { + t.Errorf("expected link type: %v, got: %v", expectedLink.Type, link.Type) + } + if link.TargetURL.String() != expectedLink.TargetURL.String() { + t.Errorf("expected target URL: %s, got: %s", expectedLink.TargetURL.String(), link.TargetURL.String()) + } + if link.IsExternal != expectedLink.IsExternal { + t.Errorf("expected IsExternal: %v, got: %v", expectedLink.IsExternal, link.IsExternal) + } + } + } + + // Validate the Result field is set + if response.Result == nil { + t.Errorf("expected Result to be initialized, got nil") + } + + // Validate EndTime is set to a non-zero value + if response.EndTime.IsZero() { + t.Errorf("expected EndTime to be set, got zero value") 
+ } + }) + } +} + +// Helper function to parse URLs and handle errors inline +func mustParseURL(rawurl string) url.URL { + parsedURL, err := url.Parse(rawurl) + if err != nil { + panic(err) + } + return *parsedURL +} diff --git a/pkg/crawler/links_test.go b/pkg/crawler/links_test.go new file mode 100644 index 0000000..b65fa6a --- /dev/null +++ b/pkg/crawler/links_test.go @@ -0,0 +1,58 @@ +package crawler + +import ( + "net/url" + "testing" +) + +func TestRewriteURLHost(t *testing.T) { + tests := []struct { + name string + inputURLs []string + newHost string + expectedURLs []string + }{ + { + name: "Valid URLs", + inputURLs: []string{"http://example.com/path", "https://another.com/otherpath"}, + newHost: "newhost.com", + expectedURLs: []string{"http://newhost.com/path", "https://newhost.com/otherpath"}, + }, + { + name: "Localhost", + inputURLs: []string{"https://example.com/path"}, + newHost: "localhost", + expectedURLs: []string{"https://localhost/path"}, + }, + { + name: "Invalid URL", + inputURLs: []string{"http://example.com/path", "://bad_url"}, + newHost: "newhost.com", + expectedURLs: []string{"http://newhost.com/path"}, // Only valid URL should be rewritten + }, + { + name: "Empty Input", + inputURLs: []string{}, + newHost: "newhost.com", + expectedURLs: []string{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := RewriteURLHost(tt.inputURLs, tt.newHost) + if len(result) != len(tt.expectedURLs) { + t.Errorf("expected %d URLs, got %d", len(tt.expectedURLs), len(result)) + } + for i := range result { + expectedURL, _ := url.Parse(tt.expectedURLs[i]) + resultURL, _ := url.Parse(result[i]) + if resultURL.Scheme != expectedURL.Scheme || + resultURL.Host != expectedURL.Host || + resultURL.Path != expectedURL.Path { + t.Errorf("expected URL %s, got %s", tt.expectedURLs[i], result[i]) + } + } + }) + } +} diff --git a/test/http_test.go b/test/http_test.go deleted file mode 100644 index 13882d5..0000000 --- a/test/http_test.go +++ /dev/null @@ -1,79 +0,0 @@ -package crawler - -import ( - "net/http" - "sort" - "sync" - "testing" - "time" - - "github.com/Pixep/crowlet/pkg/crawler" -) - -var waitMutex = &sync.Mutex{} -var fetchedUrls []string - -func TestRunConcurrentGet(t *testing.T) { - resultChan := make(chan *crawler.HTTPResponse) - quitChan := make(chan struct{}) - maxConcurrency := 3 - urls := []string{ - "url1", - "url2", - "url3", - "url4", - "url5", - } - - waitMutex.Lock() - go crawler.RunConcurrentGet(mockHTTPGet, urls, crawler.HTTPConfig{}, maxConcurrency, resultChan, quitChan) - time.Sleep(2 * time.Second) - - if len(fetchedUrls) != maxConcurrency { - t.Fatal("Incorrect channel length of", len(fetchedUrls)) - t.Fail() - } - - waitMutex.Unlock() - - resultChanOpen := true - for resultChanOpen == true { - select { - case _, resultChanOpen = <-resultChan: - } - } - - sort.Strings(fetchedUrls) - if !testEq(fetchedUrls, urls) { - t.Fatal("Expected to crawl ", urls, " but crawled ", fetchedUrls, " instead.") - t.Fail() - } -} - -func mockHTTPGet(client *http.Client, url string, config crawler.HTTPConfig) *crawler.HTTPResponse { - fetchedUrls = append(fetchedUrls, url) - waitMutex.Lock() - waitMutex.Unlock() - - return &crawler.HTTPResponse{URL: url} -} - -func testEq(a, b []string) bool { - - // If one is nil, the other must also be nil. 
- if (a == nil) != (b == nil) { - return false - } - - if len(a) != len(b) { - return false - } - - for i := range a { - if a[i] != b[i] { - return false - } - } - - return true -} From 8af6b2d7b79e44df19659a7d422a336102de4d44 Mon Sep 17 00:00:00 2001 From: Adrien Leravat Date: Sun, 10 Nov 2024 16:20:18 -0800 Subject: [PATCH 6/9] Dockerfile: bump to golang:1.23 --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index c80850a..08b1fbf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.19-alpine as builder +FROM golang:1.23-alpine as builder RUN apk add --update --no-cache git gcc musl-dev make @@ -10,7 +10,7 @@ RUN make build-static \ && mkdir -p /opt/bin \ && mv ./crowlet /opt/bin/crowlet -FROM golang:1.19-alpine +FROM golang:1.23-alpine COPY --from=builder /opt/bin/crowlet /opt/bin/crowlet From e1e4085d4ba3ddede9864b35d3e3ee7216570587 Mon Sep 17 00:00:00 2001 From: Adrien Leravat Date: Fri, 15 Nov 2024 20:16:05 -0800 Subject: [PATCH 7/9] tests: add benchmarking code --- Makefile | 3 + cmd/crowlet/crowlet_test.go | 110 ++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 cmd/crowlet/crowlet_test.go diff --git a/Makefile b/Makefile index f305875..98a03a9 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,9 @@ build-static:: install-deps ## Builds a static binary test:: ## Run tests @cd pkg/crawler && go test +benchmark:: ## Run benchmarks + @cd cmd/crowlet && go test -bench=. -benchtime=30x -benchmem + install:: ## Build and install crowlet locally @cd cmd/crowlet/ && go install . diff --git a/cmd/crowlet/crowlet_test.go b/cmd/crowlet/crowlet_test.go new file mode 100644 index 0000000..8b7b8b7 --- /dev/null +++ b/cmd/crowlet/crowlet_test.go @@ -0,0 +1,110 @@ +package main + +import ( + "flag" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/urfave/cli" +) + +func BuildSitemap(host string, count int) string { + sitemap := `{toto} + ` + for i := 1; i <= count; i++ { + if i%10 == 0 { + sitemap += fmt.Sprintf("%s/error%d", host, i) + } else { + sitemap += fmt.Sprintf("%s/page%d", host, i) + } + } + sitemap += "" + return sitemap +} + +func BuildPageContent(size int) string { + content := ` + + + + + Test Page + + +

+    <h1>Test Page</h1>
+    <p>This is a test page with repeated content to reach approximately 2KB in size.</p>
+`
+	paragraph := `<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus lacinia odio vitae vestibulum vestibulum. Cras venenatis euismod malesuada.</p>
` + + // Append paragraphs until the content reaches the desired size + for len(content) < size { + content += paragraph + } + content += ` + +` + return content +} + +func BenchmarkStartFunction(b *testing.B) { + pageContent := BuildPageContent(150000) + var sitemapXML string + + // Set up a mock HTTP server to simulate sitemap and page responses + mockServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/sitemap.xml" { + w.Header().Set("Content-Type", "application/xml") + w.WriteHeader(http.StatusOK) + w.Write([]byte(sitemapXML)) + } else if strings.HasPrefix(r.URL.Path, "/page") { + w.WriteHeader(http.StatusOK) + w.Write([]byte(pageContent)) + } else { + w.WriteHeader(http.StatusNotFound) + } + })) + sitemapXML = BuildSitemap(mockServer.URL, 1000) + + defer mockServer.Close() + + // Step 2: Set up CLI flags and arguments for the context + app := cli.NewApp() + set := flag.NewFlagSet("test", 0) + set.Int("throttle", 3, "number of http requests to do at once") + set.String("override-host", "", "override hostname") + set.String("user", "", "username for basic auth") + set.String("pass", "", "password for basic auth") + set.Int("timeout", 1000, "timeout in milliseconds") + set.Bool("crawl-external", false, "crawl external links") + set.Bool("crawl-images", false, "crawl images") + set.Bool("crawl-hyperlinks", true, "crawl hyperlinks") + set.Int("iterations", 1, "number of crawl iterations") + set.Bool("forever", false, "crawl forever") + set.Int("wait-interval", 0, "wait interval between iterations") + set.Bool("quiet", true, "suppress output") + set.Bool("json", false, "json output") + set.Int("non-200-error", 1, "error code for non-200 responses") + set.Int("response-time-error", 2, "error code for max response time exceeded") + set.Int("response-time-max", 0, "max response time in milliseconds") + set.Bool("summary-only", false, "only print summary") + + // Add sitemap URL as the argument + set.Parse([]string{mockServer.URL + "/sitemap.xml"}) + + // Create context with flags and args + ctx := cli.NewContext(app, set, nil) + + // Start the benchmark test + b.ResetTimer() // Reset the timer to measure only the time spent in the loop + b.ReportAllocs() // Report memory allocations per operation + + for i := 0; i < b.N; i++ { + err := start(ctx) + if err != nil { + b.Fatalf("start function failed: %v", err) + } + } +} From 399aa956df38d0277f02b5b83095604ad4a752fb Mon Sep 17 00:00:00 2001 From: Adrien Leravat Date: Fri, 15 Nov 2024 20:22:54 -0800 Subject: [PATCH 8/9] Makefile: push both version and latest of docker image --- Dockerfile | 2 +- Makefile | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 08b1fbf..6973e86 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.23-alpine as builder +FROM golang:1.23-alpine AS builder RUN apk add --update --no-cache git gcc musl-dev make diff --git a/Makefile b/Makefile index 98a03a9..fadb01c 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,8 @@ IMAGE_NAME = crowlet -IMAGE_VERSION = latest +IMAGE_VERSION = 0.3.0 IMAGE_ORG = aleravat IMAGE_TAG = $(IMAGE_ORG)/$(IMAGE_NAME):$(IMAGE_VERSION) +IMAGE_TAG_LATEST = $(IMAGE_ORG)/$(IMAGE_NAME):latest .DEFAULT_GOAL := build @@ -41,7 +42,9 @@ docker-build:: ## Builds the docker image docker-push:: ## Pushes the docker image to the registry @echo Pushing $(IMAGE_TAG) + @docker image tag $(IMAGE_TAG) $(IMAGE_TAG_LATEST) @docker push $(IMAGE_TAG) + @docker push $(IMAGE_TAG_LATEST) 
docker-release:: docker-build docker-push ## Builds and pushes the docker image to the registry From 38c825085d40f5366d5403e0a8fdead4035d4afa Mon Sep 17 00:00:00 2001 From: Adrien Leravat Date: Fri, 15 Nov 2024 20:41:12 -0800 Subject: [PATCH 9/9] crowlet: bump version for CLI --- cmd/crowlet/crowlet.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/crowlet/crowlet.go b/cmd/crowlet/crowlet.go index a3d53c2..b020c16 100644 --- a/cmd/crowlet/crowlet.go +++ b/cmd/crowlet/crowlet.go @@ -15,7 +15,7 @@ import ( var ( // VERSION stores the current version as string - VERSION = "v0.2.1" + VERSION = "v0.3.0" ) func beforeApp(c *cli.Context) error {
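
As a reading aid for patch 2, here is a minimal, self-contained sketch of the client-recycling pattern RunConcurrentGet now uses: a buffered channel hands out a fixed set of *http.Client values, so each worker reuses a client (and its underlying connection pool) instead of allocating one per request, and the channel doubles as the concurrency limiter. The fetch helper and URLs below are placeholders, not part of the crowlet API.

package main

import (
	"fmt"
	"net/http"
	"sync"
	"time"
)

// fetch is a stand-in for crawler.HTTPGet: it only needs a shared client and a URL.
func fetch(client *http.Client, url string) string {
	resp, err := client.Get(url)
	if err != nil {
		return fmt.Sprintf("%s: error: %v", url, err)
	}
	defer resp.Body.Close()
	return fmt.Sprintf("%s: %d", url, resp.StatusCode)
}

func main() {
	urls := []string{"https://example.com/", "https://example.org/", "https://example.net/"}
	maxConcurrent := 2

	// Pool of reusable clients; receiving from the channel blocks until one is free.
	clientsReady := make(chan *http.Client, maxConcurrent)
	for i := 0; i < maxConcurrent; i++ {
		clientsReady <- &http.Client{Timeout: 5 * time.Second}
	}

	var wg sync.WaitGroup
	for _, url := range urls {
		client := <-clientsReady
		wg.Add(1)
		go func(client *http.Client, url string) {
			defer func() {
				clientsReady <- client // recycle the client for the next worker
				wg.Done()
			}()
			fmt.Println(fetch(client, url))
		}(client, url)
	}
	wg.Wait()
}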
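Patch 3's RewriteURLHost is what backs '--override-host': the URLs taken from the sitemap are rewritten to point at the requested host before being crawled. A short usage sketch, with illustrative URLs and host name:

package main

import (
	"fmt"

	"github.com/Pixep/crowlet/pkg/crawler"
)

func main() {
	urls := []string{
		"https://example.com/products",
		"https://example.com/about",
	}
	// Keep scheme and path, but point every URL at a staging host.
	rewritten := crawler.RewriteURLHost(urls, "staging.example.com")
	fmt.Println(rewritten) // [https://staging.example.com/products https://staging.example.com/about]
}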