name: Common Crawler
# The workflow runs on a schedule, every day at 1 AM (UTC).
on:
  schedule:
    - cron: '0 1 * * *'
env:
  # The access token that grants write access to the Hugging Face database.
  HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
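  # Note: the secret must be defined under the repository's Actions secrets
  # for this expression to resolve.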
jobs:
  build-and-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          # This is necessary to push commits back to the repository.
          persist-credentials: true
          fetch-depth: 0 # Fetch all history for all tags and branches.
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.11.8
      - name: Upgrade pip
        run: python -m pip install --upgrade pip
      - name: Install dependencies
        run: pip install -r common_crawler/requirements_common_crawler_action.txt
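      # The argument meanings below are inferred from the command itself, not
      # documented here: CC-MAIN-2023-50 appears to be the Common Crawl index to
      # query, "*.gov" the URL pattern to match, "police" a search keyword, and
      # --pages the number of result pages fetched per run.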
      - name: Run script
        run: python common_crawler/main.py CC-MAIN-2023-50 "*.gov" police --config common_crawler/config.ini --pages 20
      - name: Configure Git
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "GitHub Action"
      - name: Add common_crawler cache
        run: git add common_crawler/data/cache.json
      - name: Commit changes
        run: git commit -m "Update common_crawler cache"
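      # Note: `git commit` exits non-zero when cache.json is unchanged, which would
      # fail the job. A guarded variant (a sketch, not part of the original workflow)
      # would be:
      #   run: git diff --cached --quiet || git commit -m "Update common_crawler cache"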
      - name: Push changes
        run: git push