diff --git a/data_extractor/code/esg_data_pipeline/Dockerfile b/data_extractor/code/esg_data_pipeline/Dockerfile
index bd762ff..d01e5f9 100644
--- a/data_extractor/code/esg_data_pipeline/Dockerfile
+++ b/data_extractor/code/esg_data_pipeline/Dockerfile
@@ -1,22 +1,11 @@
-# Modified mmdetection Dockerfile
-ARG PYTORCH="1.4"
-ARG CUDA="10.1"
-ARG CUDNN="7"
+FROM ubuntu:22.04
+SHELL ["/bin/bash", "-c"]
 
-FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
-
-ENV TORCH_CUDA_ARCH_LIST="3.7 6.0 6.1 7.0+PTX"
-ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
-ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
-
-RUN apt-key del 3bf863cc
-RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
-
-RUN apt-key del 7fa2af80
-RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+# no prompt during installation:
+ARG DEBIAN_FRONTEND=noninteractive
 
 # Added poppler-utils, default-jre installations
-RUN apt-get update && apt-get install -y git wget vim ninja-build poppler-utils default-jre \
+RUN apt-get update && apt-get install -y apt-utils git wget python3 python3-pip vim ninja-build poppler-utils default-jre \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
@@ -31,9 +20,9 @@ WORKDIR /app/code/esg_data_pipeline
 RUN pip install -e .
 
 RUN mkdir -p /app/server_logs
-RUN chmod -R 777 /app/server_logs
-
 RUN mkdir -p /app/data
+
+RUN chmod -R 777 /app/server_logs
 RUN chmod -R 777 /app/data
 
 CMD ./entry.sh
diff --git a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/__init__.py b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/__init__.py
index 8d0a76a..01ccdc5 100644
--- a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/__init__.py
+++ b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/__init__.py
@@ -1,6 +1,5 @@
-from .components import Extractor, PDFTextExtractor, \
-    TextCurator
-
+from .components import Extractor, PDFTextExtractor, TextCurator
+from .extraction_server import run_extraction
 import logging
 
 from .config import logging_config, config
diff --git a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/base_curator.py b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/base_curator.py
index 1dd03a3..6a1e29e 100644
--- a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/base_curator.py
+++ b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/base_curator.py
@@ -1,4 +1,3 @@
-import re
 from abc import abstractmethod
 import re
 
diff --git a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/pdf_text_extractor.py b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/pdf_text_extractor.py
index 7098696..1210959 100644
--- a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/pdf_text_extractor.py
+++ b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/pdf_text_extractor.py
@@ -3,7 +3,6 @@
 import json
 import logging
 import os
-import re
 from pathlib import Path
 
 import pandas as pd
@@ -80,25 +79,26 @@ def extract_pdf_by_page(self, pdf_file):
             _logger.warning("{}: Unable to process {}".format(e, pdf_file))
             return {}
 
-        fp = open(pdf_file, 'rb')
+        pdf_content = {}
+
+        # Create a PDF resource manager
         rsrcmgr = PDFResourceManager()
-        retstr = io.BytesIO()
-        codec = 'utf-8'
         laparams = LAParams()
-        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
-        interpreter = PDFPageInterpreter(rsrcmgr, device)
-
-        pdf_content = {}
-        for page_number, page in enumerate(PDFPage.get_pages(fp, check_extractable=False)):
-            interpreter.process_page(page)
-            data = retstr.getvalue().decode('utf-8')
-            data_paragraphs = self.process_page(data)
-            if len(data_paragraphs) == 0:
-                continue
-            pdf_content[page_number] = data_paragraphs
-            retstr.truncate(0)
-            retstr.seek(0)
-        fp.close()
+        retstr = io.StringIO()
+
+        with open(pdf_file, 'rb') as fp:
+            # Create a PDF page interpreter
+            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
+            interpreter = PDFPageInterpreter(rsrcmgr, device)
+            for page_number, page in enumerate(PDFPage.get_pages(fp)):
+                interpreter.process_page(page)
+                data = retstr.getvalue()
+                data_paragraphs = self.process_page(data)
+                if len(data_paragraphs) == 0:
+                    continue
+                pdf_content[page_number] = data_paragraphs
+                retstr.truncate(0)
+                retstr.seek(0)
 
         return pdf_content
diff --git a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/text_curator.py b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/text_curator.py
index 8c45b93..1cac3aa 100644
--- a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/text_curator.py
+++ b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/components/text_curator.py
@@ -9,7 +9,7 @@
 
 import pandas as pd
 
-import esg_data_pipeline.utils.kpi_mapping as kpi_mapping
+import data_extractor.code.utils.kpi_mapping as kpi_mapping
 
 from .base_curator import BaseCurator
 
 logger = logging.getLogger(__name__)
diff --git a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/config/__init__.py b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/config/__init__.py
index 139597f..8b13789 100644
--- a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/config/__init__.py
+++ b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/config/__init__.py
@@ -1,2 +1 @@
-
diff --git a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/config/config.py b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/config/config.py
index f49c5c4..f370c72 100644
--- a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/config/config.py
+++ b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/config/config.py
@@ -1,4 +1,3 @@
-import os
 import pathlib
 
 # General config
@@ -9,15 +8,16 @@
 ROOT = CONFIG_FOLDER.parent.parent.parent.parent
 DATA_FOLDER = ROOT / "data"
 
-#Extraction inputs
+
+# Extraction inputs
 PDFTextExtractor_kwargs = {'min_paragraph_length': 30,
-                           #Set to ANNOTATION_FOLDER if you want to extract just pdfs mentioned in the annotations
-                           #Set to None to extract all pdfs in pdf folder (for production stage)
+                           # Set to ANNOTATION_FOLDER if you want to extract just pdfs mentioned in the annotations
+                           # Set to None to extract all pdfs in pdf folder (for production stage)
                            'annotation_folder': None,
                            'skip_extracted_files': False
                            }
 
-#Curation inputs
+# Curation inputs
 TextCurator_kwargs = {
     'retrieve_paragraph': False,
     'neg_pos_ratio': 1,
diff --git a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/extraction_server.py b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/extraction_server.py
index d29c2d0..fc77e6c 100644
--- a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/extraction_server.py
+++ b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/extraction_server.py
@@ -7,11 +7,12 @@
 from flask import Flask, Response, request
 import shutil
 import traceback
-from s3_communication import S3Communication
+from data_extractor.code.utils.s3_communication import S3Communication
+
+from .components import Extractor
+from .config import config
+from .components import Curator
-from esg_data_pipeline.components import Extractor
-from esg_data_pipeline.config import config
-from esg_data_pipeline.components import Curator
 
 app = Flask(__name__)
@@ -33,21 +34,35 @@ def liveness():
 
 @app.route('/extract/')
 def run_extraction():
-    args = json.loads(request.args['payload'])
+    # args = json.loads(request.args['payload'])
+
+    # Local args dictionary
+    args = {"project_name": 'TEST'}
+    args.update({"s3_usage": False})
+    extraction_settings = {}
+    extraction_settings.update({"use_extractions": False})
+    extraction_settings.update({"seed": 42})
+    extraction_settings.update({"min_paragraph_length": 20})
+    extraction_settings.update({"annotation_folder": None})
+    extraction_settings.update({"skip_extracted_files": False})
+    extraction_settings.update({"store_extractions": True})
+    args.update({"extraction": extraction_settings})
+
+    # Load the extraction settings from args
     project_name = args["project_name"]
-    extraction_settings = args['extraction']
-
-    BASE_DATA_PROJECT_FOLDER = config.DATA_FOLDER / project_name
-    config.PDF_FOLDER = BASE_DATA_PROJECT_FOLDER / 'interim' / 'pdfs'
-    BASE_INTERIM_FOLDER = BASE_DATA_PROJECT_FOLDER / 'interim' / 'ml'
-    config.EXTRACTION_FOLDER = BASE_INTERIM_FOLDER / 'extraction'
-    config.ANNOTATION_FOLDER = BASE_INTERIM_FOLDER / 'annotations'
-    config.STAGE = 'extract'
-
-    create_directory(config.EXTRACTION_FOLDER)
-    create_directory(config.ANNOTATION_FOLDER)
-    create_directory(config.PDF_FOLDER)
+
+    # Create path strings for the different folders
+    base_data_project_folder = config.DATA_FOLDER / project_name
+    pdf_folder = base_data_project_folder / 'interim' / 'pdfs'
+    base_interim_folder = base_data_project_folder / 'interim' / 'ml'
+    extraction_folder = base_interim_folder / 'extraction'
+    annotation_folder = base_interim_folder / 'annotations'
+
+    # Create the folders if they do not already exist
+    os.makedirs(extraction_folder, exist_ok=True)
+    os.makedirs(annotation_folder, exist_ok=True)
+    os.makedirs(pdf_folder, exist_ok=True)
 
     s3_usage = args["s3_usage"]
     if s3_usage:
@@ -68,48 +83,53 @@
         )
         if extraction_settings['use_extractions']:
             s3c_main.download_files_in_prefix_to_dir(project_prefix + '/output/TEXT_EXTRACTION',
-                                                     config.EXTRACTION_FOLDER)
+                                                     extraction_folder)
         s3c_interim.download_files_in_prefix_to_dir(project_prefix + '/interim/ml/annotations',
-                                                    config.ANNOTATION_FOLDER)
+                                                    annotation_folder)
         if args['mode'] == 'train':
             s3c_main.download_files_in_prefix_to_dir(project_prefix + '/input/pdfs/training',
-                                                     config.PDF_FOLDER)
+                                                     pdf_folder)
         else:
             s3c_main.download_files_in_prefix_to_dir(project_prefix + '/input/pdfs/inference',
-                                                     config.PDF_FOLDER)
+                                                     pdf_folder)
 
-    pdfs = glob.glob(os.path.join(config.PDF_FOLDER, "*.pdf"))
+    pdfs = glob.glob(os.path.join(pdf_folder, "*.pdf"))
     if len(pdfs) == 0:
-        msg = "No pdf files found in the pdf directory ({})".format(config.PDF_FOLDER)
+        msg = "No pdf files found in the pdf directory ({})".format(pdf_folder)
         return Response(msg, status=500)
-
-    annotation_files = glob.glob(os.path.join(config.ANNOTATION_FOLDER, "*.csv"))
+
+    """
+    # TODO Why do we need annotation at all? Actually extraction does not need that!
+    annotation_files = glob.glob(os.path.join(annotation_folder, "*.csv"))
     if len(annotation_files) == 0:
         msg = "No annotations.csv file found on S3."
         return Response(msg, status=500)
     elif len(annotation_files) > 2:
         msg = "Multiple annotations.csv files found on S3."
         return Response(msg, status=500)
-
+    """
+
+    # Update the settings in config with the user settings
+    config.STAGE = 'extract'
     config.SEED = extraction_settings["seed"]
     config.PDFTextExtractor_kwargs['min_paragraph_length'] = extraction_settings["min_paragraph_length"]
     config.PDFTextExtractor_kwargs['annotation_folder'] = extraction_settings["annotation_folder"]
     config.PDFTextExtractor_kwargs['skip_extracted_files'] = extraction_settings["skip_extracted_files"]
 
+    # Create an Extractor instance with the newly updated extraction settings
     ext = Extractor(config.EXTRACTORS)
-
     try:
         t1 = time.time()
-        ext.run_folder(config.PDF_FOLDER, config.EXTRACTION_FOLDER)
+        ext.run_folder(pdf_folder, extraction_folder)
         t2 = time.time()
     except Exception as e:
         msg = "Error during extraction\nException:" + str(e)
         return Response(msg, status=500)
 
-    extracted_files = os.listdir(config.EXTRACTION_FOLDER)
+    extracted_files = os.listdir(extraction_folder)
     if len(extracted_files) == 0:
         msg = "Extraction Failed. No file was found in the extraction directory ({})"\
-            .format(config.EXTRACTION_FOLDER)
+            .format(extraction_folder)
         return Response(msg, status=500)
 
     failed_to_extract = ""
@@ -124,12 +144,12 @@
         msg += "The following pdf files, however, did not get extracted:\n" + failed_to_extract
 
     if s3_usage:
-        s3c_interim.upload_files_in_dir_to_prefix(config.EXTRACTION_FOLDER,
+        s3c_interim.upload_files_in_dir_to_prefix(extraction_folder,
                                                   project_prefix + '/interim/ml/extraction')
         # clear folder
-        create_directory(config.EXTRACTION_FOLDER)
-        create_directory(config.ANNOTATION_FOLDER)
-        create_directory(config.PDF_FOLDER)
+        create_directory(extraction_folder)
+        create_directory(annotation_folder)
+        create_directory(pdf_folder)
 
     time_elapsed = str(timedelta(seconds=t2 - t1))
     msg += "\nTime elapsed:{}".format(time_elapsed)
     return Response(msg, status=200)
@@ -137,19 +157,36 @@
 
 @app.route('/curate/')
 def run_curation():
-    args = json.loads(request.args['payload'])
+    #args = json.loads(request.args['payload'])
+
+    # Local args dictionary
+    args = {"project_name": 'TEST'}
+    args.update({"s3_usage": False})
+    curation_settings = {}
+    curation_settings.update({"retrieve_paragraph": False})
+    curation_settings.update({"neg_pos_ratio": 1})
+    curation_settings.update({"columns_to_read": ["company", "source_file", "source_page", "kpi_id", "year", "answer", "data_type", "relevant_paragraphs"]})
+    curation_settings.update({"company_to_exclude": []})
+    curation_settings.update({"create_neg_samples": True})
+    curation_settings.update({"min_length_neg_sample": 50})
+    curation_settings.update({"seed": 41})
+    args.update({"curation": curation_settings})
+
+    # Load the curation settings from args
     project_name = args["project_name"]
-    curation_settings = args["curation"]
+    curation_settings = args['curation']
 
     BASE_DATA_PROJECT_FOLDER = config.DATA_FOLDER / project_name
     BASE_INTERIM_FOLDER = BASE_DATA_PROJECT_FOLDER / 'interim' / 'ml'
-    config.EXTRACTION_FOLDER = BASE_INTERIM_FOLDER / 'extraction'
+    extraction_folder = BASE_INTERIM_FOLDER / 'extraction'
     config.CURATION_FOLDER = BASE_INTERIM_FOLDER / 'curation'
-    config.ANNOTATION_FOLDER = BASE_INTERIM_FOLDER / 'annotations'
+    annotation_folder = BASE_INTERIM_FOLDER / 'annotations'
     config.KPI_FOLDER = BASE_DATA_PROJECT_FOLDER / 'interim' / 'kpi_mapping'
 
-    create_directory(config.EXTRACTION_FOLDER)
-    create_directory(config.CURATION_FOLDER)
-    create_directory(config.ANNOTATION_FOLDER)
+
+    os.makedirs(extraction_folder, exist_ok=True)
+    os.makedirs(config.CURATION_FOLDER, exist_ok=True)
+    os.makedirs(annotation_folder, exist_ok=True)
+    os.makedirs(config.KPI_FOLDER, exist_ok=True)
 
     s3_usage = args["s3_usage"]
     if s3_usage:
@@ -169,9 +206,9 @@
             s3_bucket=os.getenv(s3_settings['interim_bucket']['s3_bucket_name']),
         )
         s3c_main.download_files_in_prefix_to_dir(project_prefix + '/input/kpi_mapping', config.KPI_FOLDER)
-        s3c_interim.download_files_in_prefix_to_dir(project_prefix + '/interim/ml/extraction', config.EXTRACTION_FOLDER)
+        s3c_interim.download_files_in_prefix_to_dir(project_prefix + '/interim/ml/extraction', extraction_folder)
         s3c_main.download_files_in_prefix_to_dir(project_prefix + '/input/annotations',
-                                                 config.ANNOTATION_FOLDER)
+                                                 annotation_folder)
         shutil.copyfile(os.path.join(config.KPI_FOLDER, "kpi_mapping.csv"),
                         "/app/code/kpi_mapping.csv")
@@ -183,21 +220,21 @@
     config.TextCurator_kwargs['min_length_neg_sample'] = curation_settings['min_length_neg_sample']
     config.SEED = curation_settings['seed']
 
-    try:
-        if len(config.CURATORS) != 0:
-            cur = Curator(config.CURATORS)
-            cur.run(config.EXTRACTION_FOLDER, config.ANNOTATION_FOLDER, config.CURATION_FOLDER)
-    except Exception as e:
-        msg = "Error during curation\nException:" + str(repr(e)) + traceback.format_exc()
-        return Response(msg, status=500)
+    #try:
+    if len(config.CURATORS) != 0:
+        cur = Curator(config.CURATORS)
+        cur.run(extraction_folder, annotation_folder, config.CURATION_FOLDER)
+    #except Exception as e:
+        #msg = "Error during curation\nException:" + str(repr(e)) + traceback.format_exc()
+        #return Response(msg, status=500)
 
     if s3_usage:
         s3c_interim.upload_files_in_dir_to_prefix(config.CURATION_FOLDER,
                                                   project_prefix + '/interim/ml/curation')
         # clear folder
         create_directory(config.KPI_FOLDER)
-        create_directory(config.EXTRACTION_FOLDER)
-        create_directory(config.ANNOTATION_FOLDER)
+        create_directory(extraction_folder)
+        create_directory(annotation_folder)
         create_directory(config.CURATION_FOLDER)
 
     return Response("Curation OK", status=200)
diff --git a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/utils/__init__.py b/data_extractor/code/esg_data_pipeline/esg_data_pipeline/utils/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/data_extractor/code/esg_data_pipeline/requirements.txt b/data_extractor/code/esg_data_pipeline/requirements.txt
index da732fb..dcb23f3 100644
--- a/data_extractor/code/esg_data_pipeline/requirements.txt
+++ b/data_extractor/code/esg_data_pipeline/requirements.txt
@@ -1,18 +1,8 @@
-traitlets==5.4
-requests==2.26.0
-jinja2==3.0
-ipython==7.23.1
-urllib3==1.26.7
-pdf2image==1.13.1
-tabula-py==2.1.1
-gdown==3.11.1
-fuzzywuzzy==0.18.0
-python-Levenshtein==0.12.0
-tqdm==4.48.0
-xlrd==1.2.0
-pdfminer.six==20221105
-pandas==1.0.5
-Flask==2.2.5
-cryptography==40.0.2
-pyOpenSSL==23.2.0
+jinja2
+urllib3==2.0.7
+pdf2image
+xlrd
+pdfminer.six
+pandas
+Flask
 boto3
diff --git a/data_extractor/code/esg_data_pipeline/setup.py b/data_extractor/code/esg_data_pipeline/setup.py
index 4447b5f..4fe6d66 100644
--- a/data_extractor/code/esg_data_pipeline/setup.py
+++ b/data_extractor/code/esg_data_pipeline/setup.py
@@ -1,4 +1,3 @@
-import io
 import os
 from pathlib import Path
 
diff --git a/data_extractor/code/esg_data_pipeline/test/test_extraction_server.py b/data_extractor/code/esg_data_pipeline/test/test_extraction_server.py
new file mode 100644
index 0000000..3737f9e
--- /dev/null
+++ b/data_extractor/code/esg_data_pipeline/test/test_extraction_server.py
@@ -0,0 +1,83 @@
+import pytest
+
+from esg_data_pipeline.extraction_server import run_extraction
+
+class TestRunExtraction:
+
+    # Extraction finishes successfully with valid input files and settings.
+    @pytest.fixture
+    def mock_s3_communication(self, mocker):
+        return mocker.patch('esg_data_pipeline.extraction_server.S3Communication')
+
+    @pytest.fixture
+    def mock_extractor(self, mocker):
+        return mocker.patch('esg_data_pipeline.extraction_server.Extractor')
+
+    def test_extraction_success_valid_input(self, mocker, mock_s3_communication, mock_extractor):
+        # Mock the necessary dependencies
+        mock_s3_communication.return_value = mocker.Mock()
+        mock_extractor.return_value = mocker.Mock()
+
+        # Invoke the function under test
+        run_extraction()
+
+        # Assert that the necessary methods were called
+        mock_s3_communication.assert_called_once()
+        mock_extractor.assert_called_once()
+
+    def test_extraction_success_valid_input_s3_usage(self, mocker, mock_s3_communication, mock_extractor):
+        # Mock the necessary dependencies
+        mock_s3_communication.return_value = mocker.Mock()
+        mock_extractor.return_value = mocker.Mock()
+
+        # Invoke the function under test
+        run_extraction()
+
+        # Assert that the necessary methods were called
+        mock_s3_communication.assert_called_once()
+        mock_extractor.assert_called_once()
+
+    def test_extraction_success_valid_input_s3_usage_train_mode(self, mocker, mock_s3_communication, mock_extractor):
+        # Mock the necessary dependencies
+        mock_s3_communication.return_value = mocker.Mock()
+        mock_extractor.return_value = mocker.Mock()
+
+        # Invoke the function under test
+        run_extraction()
+
+        # Assert that the necessary methods were called
+        mock_s3_communication.assert_called_once()
+        mock_extractor.assert_called_once()
+
+    def test_extraction_fail_no_pdf_files(self, mocker, mock_s3_communication, mock_extractor):
+        # Mock the necessary dependencies
+        mock_s3_communication.return_value = mocker.Mock()
+        mock_extractor.return_value = mocker.Mock()
+
+        # Invoke the function under test
+        response = run_extraction()
+
+        # Assert that the response status is 500
+        assert response.status_code == 500
+
+    def test_extraction_fail_no_annotations_csv(self, mocker, mock_s3_communication, mock_extractor):
+        # Mock the necessary dependencies
+        mock_s3_communication.return_value = mocker.Mock()
+        mock_extractor.return_value = mocker.Mock()
+
+        # Invoke the function under test
+        response = run_extraction()
+
+        # Assert that the response status is 500
+        assert response.status_code == 500
+
+    def test_extraction_fail_multiple_annotations_csv(self, mocker, mock_s3_communication, mock_extractor):
+        # Mock the necessary dependencies
+        mock_s3_communication.return_value = mocker.Mock()
+        mock_extractor.return_value = mocker.Mock()
+
+        # Invoke the function under test
+        response = run_extraction()
+
+        # Assert that the response status is 500
+        assert response.status_code == 500
diff --git a/data_extractor/code/local_extraction.py b/data_extractor/code/local_extraction.py
new file mode 100644
index 0000000..0ca516e
--- /dev/null
+++ b/data_extractor/code/local_extraction.py
@@ -0,0 +1,4 @@
+import esg_data_pipeline.esg_data_pipeline.extraction_server
+
+if __name__ == '__main__':
+    esg_data_pipeline.esg_data_pipeline.extraction_server.run_curation()
diff --git a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/utils/kpi_mapping.py b/data_extractor/code/utils/kpi_mapping.py
similarity index 92%
rename from data_extractor/code/esg_data_pipeline/esg_data_pipeline/utils/kpi_mapping.py
rename to
data_extractor/code/utils/kpi_mapping.py index ba2f09f..711573d 100644 --- a/data_extractor/code/esg_data_pipeline/esg_data_pipeline/utils/kpi_mapping.py +++ b/data_extractor/code/utils/kpi_mapping.py @@ -1,6 +1,4 @@ import pandas as pd -from esg_data_pipeline.config import config -import os try: df = pd.read_csv("/app/code/kpi_mapping.csv", header=0) diff --git a/data_extractor/data/TEST/interim/kpi_mapping/kpi_mapping.csv b/data_extractor/data/TEST/interim/kpi_mapping/kpi_mapping.csv new file mode 100644 index 0000000..30f42bb --- /dev/null +++ b/data_extractor/data/TEST/interim/kpi_mapping/kpi_mapping.csv @@ -0,0 +1,2 @@ +kpi_id,question,sectors,add_year,kpi_category,, +0,What is the company name?,"OG, CM, CU",FALSE,TEXT,, diff --git a/data_extractor/data/TEST/interim/ml/annotations/test_annotations.xlsx b/data_extractor/data/TEST/interim/ml/annotations/test_annotations.xlsx new file mode 100644 index 0000000..c5746a1 Binary files /dev/null and b/data_extractor/data/TEST/interim/ml/annotations/test_annotations.xlsx differ diff --git a/data_extractor/data/TEST/interim/ml/curation/esg_TEXT_dataset.csv b/data_extractor/data/TEST/interim/ml/curation/esg_TEXT_dataset.csv new file mode 100644 index 0000000..656c294 --- /dev/null +++ b/data_extractor/data/TEST/interim/ml/curation/esg_TEXT_dataset.csv @@ -0,0 +1 @@ +,question,context,company,source_file,source_page,kpi_id,year,answer,data_type,relevant_paragraphs,annotator,Index,label diff --git a/data_extractor/data/TEST/interim/ml/extraction/Test.json b/data_extractor/data/TEST/interim/ml/extraction/Test.json new file mode 100644 index 0000000..c62d85c --- /dev/null +++ b/data_extractor/data/TEST/interim/ml/extraction/Test.json @@ -0,0 +1 @@ +{"0": ["SUSTAINABILITY REPORT 2019ROYAL DUTCH SHELL PLC"], "1": ["sabotage of pipelines, as well as illegal oil refining. We also haveprogrammes in place to reduce the number of operational spills overthe long term. In 2019, we continued to carry out vital work to cleanup Bodo, an area badly affected by oilspills.Being responsible is also about behaving ethically. Our employeesmust show absolute integrity every day. They must meet the ethicalstandards that Shell, and society, expects. Our standards are set out inShell\u2019s business principles and code of conduct. We are very clear thatit is not sufficient for Shell\u2019s actions and behaviour merely to be legallysound. We must take a broad view that also considers the widerimplications of our commercial choices and our stakeholders\u2019 view ofthem. We spent a lot of time in 2019 reinforcing the standard ofbehaviour we expect. For example, all senior executives completed amandatory ethical leadership programme. I strongly believe all leadersmust set the tone from thetop.SUSTAINABLE ENERGY FUTUREThe second area we focus on is to help shape a more sustainableenergyfuture.That is why we are taking action to provide lower-carbon products to helpcustomers reduce their emissions. These are products that people rely onto live their lives, in their homes and businesses, and fortransport.We continue to work towards delivering on our Net Carbon Footprintambition to cut the intensity of the greenhouse gas emissions of theenergy products we sell by about 50% by 2050, and 20% by 2035compared to our 2016 levels, in step with society as it moves towardsmeeting the goals of the Paris Agreement. In 2019, we set shorter-termtargets for 2021 of 2-3% lower than our 2016 baseline Net CarbonFootprint. 
In early 2020, we set a Net Carbon Footprint target for2022 of 3-4% lower than our 2016 baseline. We will continue toevolve our approach overtime.We are taking action to achieve this ambition. In 2019, we continuedto offer lower-emission energy products, including natural gas,biofuels, hydrogen and renewable power. We increased ourinvestment in natural ecosystems that produce carbon credits to helpdrivers in two key markets, the Netherlands and the UK, to offset theircarbon emissions. And we increased our use of detection and repairprogrammes at our gas production sites to reduce leaks of methane, apotent greenhousegas.Of course, the task of tackling climate change is bigger than anysingle company. Everyone on the planet, from consumers, tobusinesses, to governments, must play their part in reducinggreenhouse gas emissions. Everyone must work together. One form ofcollaboration is for businesses like Shell, which supply energy, to workalongside businesses that use energy, to decarbonise their sector. Theshipping industry is one sector where such an approach could have ahuge impact. For example, the Getting to Zero Coalition bringstogether more than 90 companies to find a way to put a commerciallyviable net-zero emissions ship to sea by2030.CONTRIBUTION TO SOCIETYThe third area of sustainability for us \u2013 and it is a critical one \u2013 is tomake a positive contribution tosociety.Meeting society\u2019s expectations involves playing a positive role incommunities where we operate and in wider society. We do this bycreating jobs, developing talent and using local suppliers. We alsoinvest in education programmes to equip young aspiring engineers andscientists with the tools and skills needed to become futureinnovators.In 2019, we made further progress in providing energy to people whowould otherwise go without basics such as electric lighting. We madeseveral investments to help provide reliable electricity across Africa,Asia and beyond. This supports the effort to help to achieve universalaccess to clean, affordable energy, one of the manyUNsustainabledevelopment goals to which wecontribute.Contributing to society also means gaining and maintaining people\u2019strust. We do this by being as open as we can about what we do andwhy we do it. For example, we are being increasingly transparentabout the industry groups we are part of. In 2019, we published theIndustry Associations Climate Review, which assessed for the first timeShell\u2019s alignment with 19 industry associations on climate-relatedpolicy. We also published our first Tax Contribution Report in 2019,which presents Shell\u2019s approach to tax and explains how our businessactivities are taxedglobally.This Sustainability Report details our activities during 2019. The reportbuilds on our actions on sustainability and transparency. We are afounding member of the UN Global Compact and we also continue tosupport its corporate governance principles on human rights,environmental protection, anti-corruption and better labourpractices.Once again, I would like to thank the members of the independentReport Review Panel, who help us provide more balanced, relevantand responsivereporting.This report shows much progress. But Shell must further step up effortson all fronts, from climate change to ethical leadership to greatertransparency. We must continue to make a real contribution topeople\u2019s lives. 
We can only do this by keeping our approach tosustainability at the heart of the way we dobusiness.Ben vBen van Beuran BeurdendenChief Executive Officer1.Introduction2.Responsiblebusiness3.Sustainableenergy future4.Contributionto society5.Specialreports6.Ourperformancedata05Shell2019 Sustainability Report"], "2": ["MANAGINGGREENHOUSE GASEMISSIONSGREENHOUSE GAS EMISSIONSWe are taking action to manage the emissions from our ownoperations and the emissions from the energy we use inouroperations.Improving the energy efficiency of our facilities is one of the ways tohelp us achieve our Net Carbon Footprint ambition to cut the intensityof the greenhouse gas (GHG) emissions of the energy products we sellby around half by 2050, in step with society\u2019s progress to align withthe goal of the ParisAgreement.We require projects and facilities that produce more than 50,000tonnes ofGHGemissions a year to have a GHG and energymanagement plan inplace.These plans help drive our emissions performance through variousactions. This includes using more energy-efficient equipment, installingpower from renewable sources and considering carbon capture andstorage in the design of our new and largestprojects.GHG and energy management plans must include the sources ofGHG emissions, as well as a forecast of expected emissions at the sitefor at least 10 years. Projects under development that are expected tohave a materialGHGfootprint must meet carbon performancestandards or industrybenchmarks.During development, projects are expected to evaluate relevant low-carbon technologies and options to remove GHG emissions. To assessthe resilience of proposed projects, we consider factors such aspotential costs associated with operational GHGemissions.We use estimates of future carbon costs that are specific to eachcountry. This is an important part of our efforts to stay in step withsociety\u2019s progress toward the goals of the Paris Agreement. Theseestimates were developed using the current Nationally DeterminedContributions (NDCs) submitted by countries as part of the ParisAgreement. By 2050, our estimates for all countries increase to $85 atonne of GHGemissions.They are the first NDCs under the Paris Agreement and are scheduledto be revised every five years. Therefore, as countries update theirNDCs, we expect to update our estimates too. Accordingly, webelieve they are a more accurate reflection of society\u2019s currentimplementation of the Paris Agreement. TheUNbelieves the currentNDCs are consistent with limiting the average global temperature riseto around three degrees Celsius above pre-industrial levels. In comingdecades, we expect countries to tighten these NDCs to meet the goalsof the ParisAgreement.We have also developed and implemented a comprehensive CO2andenergy management information system that supports our facilities, forexample, by analysing real-time data to highlight maintenance gapsand monitorperformance.Greenhouse gas emissions performanceOur direct GHG emissions decreased from71milliontonnes of CO2equivalent in 2018 to70milliontonnes of CO2equivalent in 2019.The main reasons for the decrease were divestments (for example, inArgentina, Canada, Iraq, Malaysia, Norway and the UK). 
Thesedecreases were partly offset by the start-up of the Prelude floatingliquefied natural gas facility inAustralia.A Shell employee inspects equipment for potential methane leaks ata facility in Pennsylvania,USA.1.Introduction2.Responsiblebusiness3.Sustainableenergyfuture4.Contributionto society5.Specialreports6.Ourperformancedata43Shell2019 Sustainability Report"], "3": ["ENVIRONMENTAL DATAEnvironmental performance data2019201820172016201520142013201220112010GrGreenhouse gaeenhouse gas (Gs (GHG) emissionsHG) emissionsTToottal Gal GHG emissionsHG emissionsNet Carbon Footprint (gCO2e/MJ)7878797979Direct GHG emissions (Scope 1) (million tonnes CO2equivalent)[A]7070717370727673727476Carbon dioxide (CO2) (million tonnes)6767687067687371697172Methane (CH4) (thousand tonnes)[P]919192123138132134120102143128Nitrous oxide (N2O) (thousand tonnes)11111111112Hydrofluorocarbons (HFCs) (tonnes)[P]2929312221201618232223Energy indirect GHG emissions (Scope 2) (million tonnesCO2equivalent)[B]1010111211910109109GHG emissions associated with exported energy (subsetof direct GHGs)3333323Use of our refinery and natural gas products (Scope3Category11) (milliontonnesCO2equivalent)[Q]576576599579600560600600580570670GGHG emissions brHG emissions breeakakdodown bwn by businey business (Sss (Sccopeope1 and1 and2)2)Scope 1 \u2013 Upstream (million tonnes CO2equivalent)12.912.914.819.618.7Scope 1\u2013 Integrated Gas (million tonnes CO2equivalent)16.316.313.012.013.7Scope 1\u2013 Downstream (million tonnes CO2equivalent)40.340.342.241.137.6Scope 2 \u2013 Upstream[B](million tonnes CO2equivalent)1.11.11.41.41.4Scope 2 \u2013 Integrated Gas[B](million tonnes CO2equivalent)1.61.62.42.42.0Scope 2 \u2013 Downstream[B](million tonnes CO2equivalent)7.37.36.87.57.3GGHG intensitHG intensity by by Businey BusinessssUpstream and Integrated Gas GHG intensity0.1680.1680.1580.1660.166Refinery GHG intensity1.061.061.051.141.18Chemical GHG intensity1.041.040.960.950.99FFlarlaringingFlaring (upstream) (million tonnes CO2equivalent)[C][P]5.95.95.28.27.611.812.58.07.710.710.6Flaring (upstream) (million tonnes hydrocarbonflared)[C][P]1.81.81.52.52.33.53.72.42.33.43.5Nigeria[D][P]0.70.70.60.80.50.91.21.21.52.02.4Rest of the world[E]1.21.21.01.71.82.62.51.10.81.41.0EnerEnergy intensitgy intensityyUpstream excl. oil sands, LNG and GTL (gigajoules pertonne production)[C][F]1.071.071.061.051.020.830.870.890.830.750.74Refineries: Refinery Energy Index[G]94.494.494.394.895.495.494.995.698.4100.8101.8Chemical plants: Chemicals Energy Intensity19.719.718.317.618.919.6Acid gaAcid gaseses and Vs and VOCsOCsSulphur oxides (SOx) (thousand tonnes SO2)[P]6565748183889799113136139Nitrogen oxides (NOx) (thousand tonnes NO2)108108111107113104146144147146159Volatile organic compounds (VOCs) (thousandtonnes)[P]5555599515313115189891291471.Introduction2.Responsiblebusiness3.Sustainableenergy future4.Contributionto society5.Specialreports6.Ourperformancedata88Shell2019 Sustainability Report"]} \ No newline at end of file diff --git a/data_extractor/data/TEST/interim/pdfs/Test.pdf b/data_extractor/data/TEST/interim/pdfs/Test.pdf new file mode 100644 index 0000000..21ccb3e Binary files /dev/null and b/data_extractor/data/TEST/interim/pdfs/Test.pdf differ