
Commit

Writes JSON file as it scrapes. Can connect to an Elasticsearch server on localhost.
ricardoaat committed May 17, 2018
1 parent b61ae3f commit 66a3fea
Showing 3 changed files with 143 additions and 19 deletions.
27 changes: 22 additions & 5 deletions Main.go
@@ -5,6 +5,7 @@ import (
"fmt"
"net/url"
"os"
"strings"

"github.com/ricardoaat/bioschemas-gocrawlit/crawler"
"github.com/rifflock/lfshook"
@@ -43,11 +44,12 @@ func logInit(d bool) {

func main() {

e := flag.Bool("e", false, "Connects to an Elasticsearch server on http://127.0.0.1:9200")
d := flag.Bool("d", false, "Sets up the log level to debug")
v := flag.Bool("v", false, "Returns the binary version and built date info")
q := flag.Bool("q", false, "Skip queries on the URL.")
u := flag.String("u", "", "Url to crawl and extract markup")
m := flag.Int("maxdepth", 0, "Max number of recursion depth of visited URLs")
m := flag.Int("m", 0, "Max number of recursion depth of visited URLs")
p := flag.Bool("p", false, "Stay on current path.")
qr := flag.String("query", "", "Pagination query word")

@@ -71,6 +73,11 @@ func main() {
log.Error("Error parsing URL ", err)
}

nq := baseURL
nq.RawQuery = ""

f := fmt.Sprintf("%s%sschema.json", baseURL.Host, strings.Replace(baseURL.Path, "/", "_", -1))

filter := ""
if *p {
filter = fmt.Sprintf(`^%s://%s%s`, baseURL.Scheme, baseURL.Host, baseURL.Path)
@@ -81,6 +88,9 @@
ad = append(ad, fmt.Sprintf("www.%s", baseURL.Host))

c := crawler.Crawler{
UseElastic: *e,
Index: baseURL.Host,
OutputFileName: f,
BaseURL: baseURL,
SkipQueries: *q,
MaxDepth: *m,
@@ -91,11 +101,18 @@

c.Init()

if *e {
if err := c.ElasticInit(); err != nil {
log.Error("Error initializing elastic function ")
}
}

c.Start()

err = c.ToJSONfile()
if err != nil {
log.Error("ToJSONfile error ", err)
}
// err = c.ToJSONfile()
// if err != nil {
// log.Error("ToJSONfile error ", err)
// }

}
}
6 changes: 4 additions & 2 deletions README.md
@@ -23,8 +23,10 @@ Scraped data will be stored in a json file named ```<website_host>_schema.json```

- **-p**: Stay on the current path, e.g. when crawling a page like ```https://www.ebi.ac.uk/biosamples/samples``` and you don't want the crawler to follow links outside that path, such as ```https://www.ebi.ac.uk```.
- **-m**: Maximum recursion depth of visited URLs. Defaults to unlimited depth. (The crawler does not revisit URLs.)
- **-e**: Adds crawled data to an Elasticsearch (v6) service at http://127.0.0.1:9200.
- **-u**: URL of the page to start crawling from.
- **-q**: Remove the query section from the URLs found.
- **--query**: Use with **-q** so the crawler follows only links that contain the pagination query word provided, e.g., ```./bioschemas-gocrawlit -u https://tess.elixir-europe.org/events -q --query page```
- **-h**: Print Help and exit.


@@ -48,6 +50,6 @@ The binaries will be placed under the build/ path.
- [x] Better file output
- [x] Sitemap.xml Crawl option
- [x] Pagination option
- [ ] Connecting to a flexible storage
- [x] Connecting to a flexible storage
- [ ] RDFa extraction support
- [ ] Writing file as it scrapes
- [x] Writing file as it scrapes
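
Since the crawler now writes one JSON object per line as it scrapes (see `sendToJSONfile` in the crawler changes below), consumers can stream the output file line by line. A minimal sketch of such a consumer, assuming an output file from a previous crawl at the illustrative path used here:

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"log"
	"os"
)

func main() {
	// Hypothetical output file from an earlier crawl; adjust to your own run.
	f, err := os.Open("www.ebi.ac.uk_biosamples_samplesschema.json")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Each line is an independent JSON object holding the extracted metadata
	// plus a "page" field, as written by sendToJSONfile.
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		var rec map[string]interface{}
		if err := json.Unmarshal(sc.Bytes(), &rec); err != nil {
			log.Println("skipping malformed line:", err)
			continue
		}
		fmt.Println(rec["page"])
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
}
```
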
129 changes: 117 additions & 12 deletions crawler/crawler.go
@@ -1,38 +1,53 @@
package crawler

import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"net/url"
"os"
"regexp"
"strings"

"github.com/gocolly/colly"
"github.com/olivere/elastic"
log "github.com/sirupsen/logrus"
)

type pageData struct {
Page string `json:"page"`
Metadata string `json:"data"`
Page string `json:"page"`
Metadata map[string]interface{} `json:"data"`
}

// Crawler structure wraps colly's collector and
// is configured to extract microdata and JSON-LD metadata.
type Crawler struct {
Index string
C *colly.Collector
BaseURL *url.URL
UseElastic bool
SkipQueries bool
MaxDepth int
AllowedDomains []string
PagesData []pageData
Filter string
QueryWord string
ElasticClient *elastic.Client
OutputFileName string
Client *elastic.Client
OutFile *os.File
}

// Init sets up the initial configuration for the crawler
// based on the parameters given when the crawler instance is created.
func (cw *Crawler) Init() {
f, err := os.Create(cw.OutputFileName)
if err != nil {
log.Error("Error opening file ", f)
}
cw.OutFile = f

cacheDir := fmt.Sprintf("bioschemas_gocrawlit_cache/%s_cache", cw.BaseURL.Host)

cw.C = colly.NewCollector(
@@ -68,7 +83,17 @@ func (cw *Crawler) Init() {
log.Warn("Script found ", e.Request.URL)
log.Debug(e.Text)

cw.PagesData = append(cw.PagesData, pageData{e.Request.URL.String(), e.Text})
var res map[string]interface{}
if err := json.Unmarshal([]byte(e.Text), &res); err != nil {
log.Error("Error getting MAP from microdata json result ")
}
pageData := pageData{e.Request.URL.String(), res}

if cw.UseElastic {
cw.sendToElastic(pageData)
}

cw.sendToJSONfile(pageData)
})

cw.C.OnHTML(`html`, func(e *colly.HTMLElement) {
@@ -81,13 +106,19 @@
log.Error("Error getting HTML")
}

json, err := extractMicrodata(html, cw.BaseURL)

res, err := extractMicrodata(html, cw.BaseURL)
if err != nil {
log.Error("Error calling extractMicrodata ", err)
return
}
cw.PagesData = append(cw.PagesData, pageData{e.Request.URL.String(), string(json)})

pageData := pageData{e.Request.URL.String(), res}

if cw.UseElastic {
cw.sendToElastic(pageData)
}

cw.sendToJSONfile(pageData)
}

//time.Sleep(1 * time.Second)
@@ -141,26 +172,100 @@ func (cw *Crawler) Init() {
// Start visits the URL given as the entry point,
// starting the crawling process.
func (cw *Crawler) Start() {
defer cw.OutFile.Close()
cw.C.Visit(cw.BaseURL.String())
}

func extractMicrodata(html string, baseURL *url.URL) ([]byte, error) {
var json []byte
func (cw *Crawler) sendToElastic(p pageData) {
ctx := context.Background()
data := p.Metadata
data["page"] = p.Page
_, err := cw.Client.Index().Index(cw.Index).Type("page").BodyJson(data).Do(ctx)
if err != nil {
log.Panic("Error indexig ", p.Page)
}
}

func (cw *Crawler) sendToJSONfile(p pageData) {
data := p.Metadata
data["page"] = p.Page

j, err := json.Marshal(data)
if err != nil {
log.Error("Error at marshalling lemap to json ", err)
}

_, err = cw.OutFile.WriteString(string(j) + "\n")
if err != nil {
log.Error("Error writing output file line ", err)
}
}

// ElasticInit sets up the initial configuration for the
// crawler's elastic interface
func (cw *Crawler) ElasticInit() error {

ctx := context.Background()

c, err := elastic.NewClient(
elastic.SetSniff(false),
)
if err != nil {
log.Panic("Error creating elastic client ", err)
return err
}

cw.Client = c

inf, code, err := cw.Client.Ping("http://127.0.0.1:9200").Do(ctx)
if err != nil {
log.Panic("Error Pinging elastic client ", err)
return err
}
log.Info(fmt.Sprintf("Elasticsearch returned with code %d and version %s\n", code, inf.Version.Number))

ex, err := cw.Client.IndexExists(cw.Index).Do(ctx)
if err != nil {
log.Panic("Error fetchin index existence ", err)
return err
}

if !ex {
in, err := cw.Client.CreateIndex(cw.Index).Do(ctx)
log.Info("Creating index " + cw.Index)
if err != nil {
log.Panic("Error creating index ", err)
}

if !in.Acknowledged {
log.Error("Index creation not acknowledged")
}
}

return nil
}

func extractMicrodata(html string, baseURL *url.URL) (map[string]interface{}, error) {
var res map[string]interface{}

p := NewParser(strings.NewReader(html), baseURL)
data, err := p.Parse()
if err != nil {
log.Error("Error parsing microdata from HTML ", html)
return json, err
return res, err
}

json, err = data.JSON()
r, err := data.JSON()
if err != nil {
log.Error("Error getting JSON from microdata HTML ")
return json, err
return res, err
}

return json, nil
if err := json.Unmarshal(r, &res); err != nil {
log.Error("Error getting MAP from microdata json result ")
return res, err
}
return res, nil

}
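
To round out the picture, here is a minimal standalone sketch of the same olivere/elastic (v6) calls used by `ElasticInit` and `sendToElastic` above, assuming an Elasticsearch 6.x instance listening on http://127.0.0.1:9200; the index name and document are purely illustrative:

```go
package main

import (
	"context"
	"log"

	"github.com/olivere/elastic"
)

func main() {
	ctx := context.Background()

	// Same client options as ElasticInit: default local URL, sniffing disabled.
	client, err := elastic.NewClient(elastic.SetSniff(false))
	if err != nil {
		log.Fatal("error creating elastic client: ", err)
	}

	// Create the index if it does not exist yet (illustrative index name).
	index := "example-crawl"
	exists, err := client.IndexExists(index).Do(ctx)
	if err != nil {
		log.Fatal("error checking index existence: ", err)
	}
	if !exists {
		if _, err := client.CreateIndex(index).Do(ctx); err != nil {
			log.Fatal("error creating index: ", err)
		}
	}

	// Index one document the same way sendToElastic does: the extracted
	// metadata map plus the page URL stored under the "page" key.
	doc := map[string]interface{}{
		"page":  "https://www.example.org/sample",
		"@type": "Dataset",
	}
	if _, err := client.Index().Index(index).Type("page").BodyJson(doc).Do(ctx); err != nil {
		log.Fatal("error indexing document: ", err)
	}
	log.Println("document indexed into ", index)
}
```
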
