Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

爬到的东西为空 #30

Open
LLMApple opened this issue Dec 21, 2023 · 1 comment
Open

爬到的东西为空 #30

LLMApple opened this issue Dec 21, 2023 · 1 comment

Comments

@LLMApple
Copy link

好像爬取到的 text 为空;另外,在添加三元组的时候 attrs 和 values 也是空的,所以三元组无法写入。

@LLMApple
Copy link
Author

LLMApple commented Dec 21, 2023

我改了一下代码,能跑起来了:
`# -*- coding: utf-8 -*-
import scrapy
import logging
import urllib
import os
import glob
import re
import pymongo
from scrapy.selector import Selector
from neo4j import GraphDatabase
import logging
import time
# Configure file logging for this crawl run.
#
# The log directory must exist before logging.basicConfig opens the file.
# makedirs(exist_ok=True) is used instead of an exists()/mkdir() pair so that
# a directory created in between cannot raise, and the directory name is
# written without a trailing backslash (the pasted form 'logs\' is an
# unterminated string literal).
os.makedirs('logs', exist_ok=True)

# One log file per run, named after the local start time. The format avoids
# ':' so the name is also valid on Windows.
logfile_name = time.strftime('%d-%b-%y %H-%M-%S', time.localtime())
log_file_path = os.path.join('logs', f'{logfile_name}.log')

logging.basicConfig(filename=log_file_path, filemode='a+',
                    format='%(levelname)s - %(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')

class BaikeSpider(scrapy.Spider):
    """Crawl Baidu Baike entries into MongoDB and Neo4j.

    For every visited entry page the spider:

    * stores the joined paragraph text in the ``db_baike`` collection,
    * follows every ``/item/...`` link whose entry has not been seen yet,
    * stores each (entity, attribute, value) row of the basic-info box in the
      ``db_triples`` collection and merges it into Neo4j as
      ``(entity)-[:attribute]->(value)``.

    NOTE: the MongoDB and Neo4j connections below are opened when the class
    body is executed, i.e. as a side effect of importing this module.
    """

    name = 'baike'
    allowed_domains = ['baike.baidu.com']
    start_urls = ['https://baike.baidu.com/item/文汇报']

    db = pymongo.MongoClient("mongodb://127.0.0.1:27017/")["db_kg"]
    db_baike = db['db_baike']
    db_triples = db['db_triples']

    # Entry names already stored by a previous run.
    olds = {item['_id'] for item in db_baike.find({}, {'_id': 1})}
    if olds:
        # Resume from a stored entry. pop() removes it from ``olds`` so that
        # parse() does not skip it as "already crawled".
        start_urls = ['https://baike.baidu.com/item/' + olds.pop()]

    # NOTE(review): credentials are hard-coded; consider loading them from
    # Scrapy settings or the environment instead.
    uri = "bolt://localhost:7687"
    user = "neo4j"  # replace with your username
    password = "123"  # replace with your password
    driver = GraphDatabase.driver(uri, auth=(user, password), encrypted=False)

    @staticmethod
    def _item_name(url):
        """Return the entry name for an (already unquoted) entry URL."""
        return url.replace('https://baike.baidu.com/item/', '').replace('/', '')

    @staticmethod
    def _element_texts(elements):
        """Return one string per selected element, joining all its text nodes."""
        return [''.join(element.xpath('.//text()').getall())
                for element in elements]

    def add_node(self, tx, name1, relation, name2):
        """Merge two nodes and a relationship between them into Neo4j.

        Args:
            tx: Neo4j transaction supplied by ``session.write_transaction``.
            name1: ``name`` property of the source node.
            relation: Relationship type linking the two nodes.
            name2: ``name`` property of the target node.
        """
        # A relationship type cannot be sent as a query parameter, so it has
        # to be spliced into the query text. Backtick-quote it (doubling any
        # embedded backtick) so labels containing punctuation neither break
        # the query nor inject Cypher.
        quoted_relation = '`' + relation.replace('`', '``') + '`'
        tx.run("MERGE (a:Node {name: $name1}) "
               "MERGE (b:Node {name: $name2}) "
               "MERGE (a)-[:" + quoted_relation + "]-> (b)",
               name1=name1, name2=name2)

        print("Nodes and relationship added to Neo4j.")

    def parse(self, response):
        """Store one entry page, follow its links and save its triples.

        Args:
            response: Scrapy response for a Baidu Baike entry page.

        Yields:
            Requests for linked entries that have not been crawled yet.
        """
        item_name = self._item_name(urllib.parse.unquote(response.url))
        # Entries that were already crawled are ignored entirely.
        if item_name in self.olds:
            return

        # Save the page text to MongoDB, falling back to the entry name when
        # no paragraph text was found.
        text = ''.join(response.xpath('//div[@class="main-content"]').xpath(
            '//div[@class="para"]//text()').getall())
        if not text:
            text = item_name
        try:
            self.db_baike.insert_one({'_id': item_name, 'text': text})
        except pymongo.errors.DuplicateKeyError:
            # Expected for the resumed start entry, which is already stored.
            pass
        # Record the entry as crawled.
        self.olds.add(item_name)

        # Follow the entry links found on the page.
        hrefs = set(response.xpath(
            '//a[contains(@href, "/item/")]/@href').re(r'/item/[A-Za-z0-9%\u4E00-\u9FA5]+'))
        for href in hrefs:
            new_url = 'https://baike.baidu.com' + urllib.parse.unquote(href)
            if self._item_name(new_url) not in self.olds:
                yield response.follow(new_url, callback=self.parse)

        # Extract the triples from the basic-info box.
        entity = ''.join(response.xpath(
            '//h1/text()').getall()).replace('/', '')
        if not entity:
            # Without a title every triple would hang off an empty-named node.
            return

        # Build one string per <dt>/<dd> element. Collecting raw text nodes
        # instead yields several strings for any value with nested markup,
        # which makes the two lists differ in length and drops the page.
        attrs = self._element_texts(response.xpath(
            '//dt[@class="basicInfoItem_Ql5xB itemName_bc1nm"]'))
        values = self._element_texts(response.xpath(
            '//dd[@class="basicInfoItem_Ql5xB itemValue_Kzb4E"]'))

        if len(attrs) != len(values):
            logging.warning('%s: %d attrs but %d values, triples skipped',
                            entity, len(attrs), len(values))
            return

        with self.driver.session() as session:
            for attr, value in zip(attrs, values):
                # Strip whitespace BEFORE the emptiness check so that
                # padding-only labels do not become an empty relationship.
                relation_type = re.sub(r'\s+', '', attr)
                value = re.sub(r'\s+', '', value)
                if not relation_type or not value:
                    continue
                print(relation_type)

                triple_id = entity+'_'+attr+'_'+value
                logging.warning(triple_id)
                # Handle failures per triple so that one bad row does not
                # discard the remaining rows of the page.
                try:
                    try:
                        self.db_triples.insert_one({
                            "_id": triple_id,
                            "item_name": entity,
                            "attr": attr,
                            "value": value,
                        })
                    except pymongo.errors.DuplicateKeyError:
                        pass

                    session.write_transaction(
                        self.add_node, entity, relation_type, value)
                except Exception:
                    # Best effort: record the failure with its traceback and
                    # carry on with the next triple.
                    logging.exception('failed to store triple %s', triple_id)

    def closed(self, reason):
        """Release the database connections when the spider closes.

        Args:
            reason: Close reason passed by Scrapy (unused).
        """
        self.driver.close()
        self.db.client.close()

`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant