
Commit

Writes JSON file as it scrapes. Can connect to an Elasticsearch server on localhost.
ricardoaat committed May 17, 2018
1 parent b61ae3f commit 66a3fea
Showing 3 changed files with 143 additions and 19 deletions.
27 changes: 22 additions & 5 deletions Main.go
@@ -5,6 +5,7 @@ import (
"fmt"
"net/url"
"os"
"strings"

"github.com/ricardoaat/bioschemas-gocrawlit/crawler"
"github.com/rifflock/lfshook"
@@ -43,11 +44,12 @@ func logInit(d bool) {

func main() {

e := flag.Bool("e", false, "Connects to an Elasticsearch server on http://127.0.0.1:9200")
d := flag.Bool("d", false, "Sets up the log level to debug")
v := flag.Bool("v", false, "Returns the binary version and built date info")
q := flag.Bool("q", false, "Skip queries on the URL.")
u := flag.String("u", "", "Url to crawl and extract markup")
m := flag.Int("maxdepth", 0, "Max number of recursion depth of visited URLs")
m := flag.Int("m", 0, "Max number of recursion depth of visited URLs")
p := flag.Bool("p", false, "Stay on current path.")
qr := flag.String("query", "", "Pagination query word")

@@ -71,6 +73,11 @@ func main() {
log.Error("Error parsing URL ", err)
}

nq := baseURL
nq.RawQuery = ""

f := fmt.Sprintf("%s%sschema.json", baseURL.Host, strings.Replace(baseURL.Path, "/", "_", -1))

filter := ""
if *p {
filter = fmt.Sprintf(`^%s://%s%s`, baseURL.Scheme, baseURL.Host, baseURL.Path)
@@ -81,6 +88,9 @@
ad = append(ad, fmt.Sprintf("www.%s", baseURL.Host))

c := crawler.Crawler{
UseElastic: *e,
Index: baseURL.Host,
OutputFileName: f,
BaseURL: baseURL,
SkipQueries: *q,
MaxDepth: *m,
@@ -91,11 +101,18 @@

c.Init()

if *e {
if err := c.ElasticInit(); err != nil {
log.Error("Error initializing elastic function ")
}
}

c.Start()

err = c.ToJSONfile()
if err != nil {
log.Error("ToJSONfile error ", err)
}
// err = c.ToJSONfile()
// if err != nil {
// log.Error("ToJSONfile error ", err)
// }

}
}
6 changes: 4 additions & 2 deletions README.md
@@ -23,8 +23,10 @@ Scraped data will be stored in a json file named ```<website_host>_schema.json```

- **-p**: Stay on the current path, e.g. when crawling a page like ```https://www.ebi.ac.uk/biosamples/samples``` and you don't want the crawler to follow links outside that path, such as ```https://www.ebi.ac.uk```.
- **-m**: Maximum recursion depth of visited URLs. Defaults to unlimited depth. (The crawler does not revisit URLs.)
- **-e**: Adds crawled data to an Elasticsearch (v6) service at http://127.0.0.1:9200.
- **-u**: URL of the page to start crawling from.
- **-q**: Remove the query section from the URLs found.
- **--query**: Use with **-q** so the crawler follows only links that contain the pagination query word provided, e.g., ```./bioschemas-gocrawlit -u https://tess.elixir-europe.org/events -q --query page```
- **-h**: Print Help and exit.


@@ -48,6 +50,6 @@ The binaries will be placed under the build/ path.
- [x] Better file output
- [x] Sitemap.xml Crawl option
- [x] Pagination option
- [ ] Connecting to a flexible storage
- [x] Connecting to a flexible storage
- [ ] RDFa extraction support
- [ ] Writing file as it scrapes
- [x] Writing file as it scrapes
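
Since the crawler now writes one JSON object per line as it scrapes (see `sendToJSONfile` in the crawler changes below), consumers can stream the output file line by line. A minimal sketch of such a consumer, assuming an output file from a previous crawl at the illustrative path used here:

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"log"
	"os"
)

func main() {
	// Hypothetical output file from an earlier crawl; adjust to your own run.
	f, err := os.Open("www.ebi.ac.uk_biosamples_samplesschema.json")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Each line is an independent JSON object holding the extracted metadata
	// plus a "page" field, as written by sendToJSONfile.
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		var rec map[string]interface{}
		if err := json.Unmarshal(sc.Bytes(), &rec); err != nil {
			log.Println("skipping malformed line:", err)
			continue
		}
		fmt.Println(rec["page"])
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
}
```
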
129 changes: 117 additions & 12 deletions crawler/crawler.go
@@ -1,38 +1,53 @@
package crawler

import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"net/url"
"os"
"regexp"
"strings"

"github.com/gocolly/colly"
"github.com/olivere/elastic"
log "github.com/sirupsen/logrus"
)

type pageData struct {
Page string `json:"page"`
Metadata string `json:"data"`
Page string `json:"page"`
Metadata map[string]interface{} `json:"data"`
}

// Crawler structure wraps colly's collector and
// is configured to extract microdata and JSON-LD metadata.
type Crawler struct {
Index string
C *colly.Collector
BaseURL *url.URL
UseElastic bool
SkipQueries bool
MaxDepth int
AllowedDomains []string
PagesData []pageData
Filter string
QueryWord string
ElasticClient *elastic.Client
OutputFileName string
Client *elastic.Client
OutFile *os.File
}

// Init sets up the initial configuration for the crawler
// based on the parameters given when the crawler instance is created.
func (cw *Crawler) Init() {
f, err := os.Create(cw.OutputFileName)
if err != nil {
log.Error("Error opening file ", f)
}
cw.OutFile = f

cacheDir := fmt.Sprintf("bioschemas_gocrawlit_cache/%s_cache", cw.BaseURL.Host)

cw.C = colly.NewCollector(
@@ -68,7 +83,17 @@ func (cw *Crawler) Init() {
log.Warn("Script found ", e.Request.URL)
log.Debug(e.Text)

cw.PagesData = append(cw.PagesData, pageData{e.Request.URL.String(), e.Text})
var res map[string]interface{}
if err := json.Unmarshal([]byte(e.Text), &res); err != nil {
log.Error("Error getting MAP from microdata json result ")
}
pageData := pageData{e.Request.URL.String(), res}

if cw.UseElastic {
cw.sendToElastic(pageData)
}

cw.sendToJSONfile(pageData)
})

cw.C.OnHTML(`html`, func(e *colly.HTMLElement) {
@@ -81,13 +106,19 @@
log.Error("Error getting HTML")
}

json, err := extractMicrodata(html, cw.BaseURL)

res, err := extractMicrodata(html, cw.BaseURL)
if err != nil {
log.Error("Error calling extractMicrodata ", err)
return
}
cw.PagesData = append(cw.PagesData, pageData{e.Request.URL.String(), string(json)})

pageData := pageData{e.Request.URL.String(), res}

if cw.UseElastic {
cw.sendToElastic(pageData)
}

cw.sendToJSONfile(pageData)
}

//time.Sleep(1 * time.Second)
@@ -141,26 +172,100 @@ func (cw *Crawler) Init() {
// Start visits the URL given as the entry point,
// starting the crawling process.
func (cw *Crawler) Start() {
defer cw.OutFile.Close()
cw.C.Visit(cw.BaseURL.String())
}

func extractMicrodata(html string, baseURL *url.URL) ([]byte, error) {
var json []byte
func (cw *Crawler) sendToElastic(p pageData) {
ctx := context.Background()
data := p.Metadata
data["page"] = p.Page
_, err := cw.Client.Index().Index(cw.Index).Type("page").BodyJson(data).Do(ctx)
if err != nil {
log.Panic("Error indexig ", p.Page)
}
}

func (cw *Crawler) sendToJSONfile(p pageData) {
data := p.Metadata
data["page"] = p.Page

j, err := json.Marshal(data)
if err != nil {
log.Error("Error at marshalling lemap to json ", err)
}

_, err = cw.OutFile.WriteString(string(j) + "\n")
if err != nil {
log.Error("Error writing output file line ", err)
}
}

// ElasticInit sets up the initial configuration for the
// crawler's elastic interface
func (cw *Crawler) ElasticInit() error {

ctx := context.Background()

c, err := elastic.NewClient(
elastic.SetSniff(false),
)
if err != nil {
log.Panic("Error creating elastic client ", err)
return err
}

cw.Client = c

inf, code, err := cw.Client.Ping("http://127.0.0.1:9200").Do(ctx)
if err != nil {
log.Panic("Error Pinging elastic client ", err)
return err
}
log.Info(fmt.Sprintf("Elasticsearch returned with code %d and version %s\n", code, inf.Version.Number))

ex, err := cw.Client.IndexExists(cw.Index).Do(ctx)
if err != nil {
log.Panic("Error fetchin index existence ", err)
return err
}

if !ex {
in, err := cw.Client.CreateIndex(cw.Index).Do(ctx)
log.Info("Creating index " + cw.Index)
if err != nil {
log.Panic("Error creating index ", err)
}

if !in.Acknowledged {
log.Error("Index creation not acknowledged")
}
}

return nil
}

func extractMicrodata(html string, baseURL *url.URL) (map[string]interface{}, error) {
var res map[string]interface{}

p := NewParser(strings.NewReader(html), baseURL)
data, err := p.Parse()
if err != nil {
log.Error("Error parsing microdata from HTML ", html)
return json, err
return res, err
}

json, err = data.JSON()
r, err := data.JSON()
if err != nil {
log.Error("Error getting JSON from microdata HTML ")
return json, err
return res, err
}

return json, nil
if err := json.Unmarshal(r, &res); err != nil {
log.Error("Error getting MAP from microdata json result ")
return res, err
}
return res, nil

}
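
To round out the picture, here is a minimal standalone sketch of the same olivere/elastic (v6) calls used by `ElasticInit` and `sendToElastic` above, assuming an Elasticsearch 6.x instance listening on http://127.0.0.1:9200; the index name and document are purely illustrative:

```go
package main

import (
	"context"
	"log"

	"github.com/olivere/elastic"
)

func main() {
	ctx := context.Background()

	// Same client options as ElasticInit: default local URL, sniffing disabled.
	client, err := elastic.NewClient(elastic.SetSniff(false))
	if err != nil {
		log.Fatal("error creating elastic client: ", err)
	}

	// Create the index if it does not exist yet (illustrative index name).
	index := "example-crawl"
	exists, err := client.IndexExists(index).Do(ctx)
	if err != nil {
		log.Fatal("error checking index existence: ", err)
	}
	if !exists {
		if _, err := client.CreateIndex(index).Do(ctx); err != nil {
			log.Fatal("error creating index: ", err)
		}
	}

	// Index one document the same way sendToElastic does: the extracted
	// metadata map plus the page URL stored under the "page" key.
	doc := map[string]interface{}{
		"page":  "https://www.example.org/sample",
		"@type": "Dataset",
	}
	if _, err := client.Index().Index(index).Type("page").BodyJson(doc).Do(ctx); err != nil {
		log.Fatal("error indexing document: ", err)
	}
	log.Println("document indexed into ", index)
}
```
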
