# Common Crawler #2
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Common Crawler

# Scheduled workflow: runs the common_crawler script once a day and commits
# the refreshed cache file back to the repository.
# The cron expression fires every day at 01:00 (GitHub evaluates cron in UTC).
on:
  schedule:
    - cron: '0 1 * * *'

# The job pushes a commit, so the workflow token needs write access to the
# repository contents regardless of the repository's default token setting.
permissions:
  contents: write

env:
  # The access token enabling write access to the Huggingface Database
  HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}

jobs:
  build-and-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # This is necessary to push commits back to the repository
          persist-credentials: true
          fetch-depth: 0  # Fetch all history for all tags and branches

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is always read as a string.
          python-version: '3.11.8'

      - name: Upgrade pip
        run: python -m pip install --upgrade pip

      - name: Install dependencies
        # `-r` makes pip read the file as a requirements list; without it pip
        # treats the path as a package to install and the step fails.
        run: pip install -r common_crawler/requirements_common_crawler_action.txt

      - name: Run script
        # '*.gov' is quoted so the shell passes the pattern to the script
        # verbatim instead of attempting filename expansion on it.
        run: >-
          python common_crawler/main.py CC-MAIN-2023-50 '*.gov' police
          --config common_crawler/config.ini --pages 20

      - name: Configure Git
        # NOTE(review): the committer address was not legible in the reviewed
        # copy of this file; the github-actions bot noreply address is used
        # here. Confirm the intended committer identity.
        run: |
          git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git config --local user.name "GitHub Action"

      - name: Add common_crawler cache
        run: git add common_crawler/data/cache.json

      - name: Commit changes
        # `git commit` exits non-zero when nothing is staged, which would fail
        # the scheduled run on days the cache did not change; skip it then.
        run: |
          if git diff --cached --quiet; then
            echo "No changes to common_crawler cache; nothing to commit."
          else
            git commit -m "Update common_crawler cache"
          fi

      - name: Push changes
        run: git push