Skip to content

Get news from websites #1081

Get news from websites

Get news from websites #1081

Workflow file for this run

name: Get news from websites

# Triggers: a manual run (workflow_dispatch) plus one scheduled run per day.
on:
  # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch
  workflow_dispatch:
  # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule
  schedule:
    # Minute 26 of hour 15, every day (GitHub evaluates cron schedules in UTC).
    - cron: '26 15 * * *'
jobs:
  # Scrapes France 2 and TF1 news with sbt, moves the freshly produced
  # partitions into data-news-json/, regenerates the website data and pushes
  # the result to main.
  click-here-to-see-data:
    # The ubuntu-20.04 hosted image has been retired by GitHub.
    # NOTE(review): the steps below call `sbt` directly, so the image must
    # ship it - confirm ubuntu-22.04 still does before moving to a newer one.
    runs-on: ubuntu-22.04
    permissions:
      contents: write  # the last step pushes to main with the default token
    steps:
      - uses: actions/checkout@v4
      # - uses: coursier/cache-action@v6
      - name: Set up JDK
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'
          cache: 'sbt'

      # The following steps export hive-style partition paths
      # (year=YYYY/month=M/day=D, no zero padding) through GITHUB_ENV so that
      # later steps can read them as ordinary environment variables.
      - name: Set yesterday date year month as env variable for partitioning
        run: |
          echo "PATH_PARTITION=$(date -d "yesterday" +'year=%Y/month=%-m')" >> "$GITHUB_ENV"
      - name: Set yesterday date year month day as env variable for partitioning
        run: |
          echo "PATH_DAY_PARTITION=$(date -d "yesterday" +'year=%Y/month=%-m/day=%-d')" >> "$GITHUB_ENV"
      - name: Set 2 day ago date year month day as env variable for partitioning
        run: |
          echo "PATH_DAY_2_DAY_AGO_PARTITION=$(date -d "2 days ago" +'year=%Y/month=%-m/day=%-d')" >> "$GITHUB_ENV"
      - name: Set 3 day ago date year month day as env variable for partitioning
        run: |
          echo "PATH_DAY_3_DAY_AGO_PARTITION=$(date -d "3 days ago" +'year=%Y/month=%-m/day=%-d')" >> "$GITHUB_ENV"
      - name: Set yesterday date as env variable for partitioning
        run: |
          echo "PATH_YEAR_PARTITION=$(date -d "yesterday" +'year=%Y')" >> "$GITHUB_ENV"
      - name: Set last month as env variable for partitioning
        # A bare `date -d "last month"` keeps the day of month, so on the 29th
        # to 31st it can overflow into the current month (31 March resolves to
        # 3 March). Anchoring on the 15th always yields the previous month.
        run: |
          echo "PATH_LAST_MONTH_PARTITION=$(date -d "$(date +%Y-%m-15) -1 month" +'year=%Y/month=%-m')" >> "$GITHUB_ENV"

      - name: List France 2 news urls containing global warming (see end)
        run: sbt "runMain com.github.polomarcus.main.TelevisionNewsAnalyser 1 france2"
      - name: Organize and refresh data for France 2
        run: |
          ls -R output-france2-tv-news-json/
      - name: List data for France 2
        run: ls -R data-news-json/media=France\ 2/

      - name: List TF1 news urls containing global warming (see end)
        run: sbt "runMain com.github.polomarcus.main.TelevisionNewsAnalyser 2 tf1"
      - name: Organize and refresh new data for TF1
        run: |
          ls -R output-tf1-tv-news-json/
      - name: List new data for TF1
        run: |
          ls -R data-news-json/media=TF1/

      # Second checkout of the same repository; `clean: false` skips the
      # `git clean` pass so the untracked output-* folders produced above
      # are left in place.
      - name: Create website data and save data with git
        uses: actions/checkout@v4
        with:
          clean: false
      - name: Move new data into data-news-json, update news and push
        run: |
          f2_data='data-news-json/media=France 2'
          f2_out='output-france2-tv-news-json/media=France 2'
          tf1_data='data-news-json/media=TF1'
          tf1_out='output-tf1-tv-news-json/media=TF1'

          # `mv -n` never overwrites, so the last three day partitions are
          # removed first to let the fresh scrape replace them. The
          # `|| true` guards keep the step alive when a source glob matches
          # nothing (the default shell aborts on the first failing command).

          # --- France 2 ---
          mkdir -p "$f2_data/$PATH_YEAR_PARTITION"
          mkdir -p "$f2_data/$PATH_PARTITION"
          # Destination must exist as a directory, otherwise `mv` of a single
          # entry would rename it to the partition path itself.
          mkdir -p "$f2_data/$PATH_LAST_MONTH_PARTITION"
          rm -rf "$f2_data/$PATH_DAY_PARTITION" || true
          rm -rf "$f2_data/$PATH_DAY_2_DAY_AGO_PARTITION" || true
          rm -rf "$f2_data/$PATH_DAY_3_DAY_AGO_PARTITION" || true
          mkdir -p "$f2_out/$PATH_LAST_MONTH_PARTITION"
          mkdir -p "$f2_out/$PATH_PARTITION"
          mv -vn "$f2_out/$PATH_PARTITION"/** "$f2_data/$PATH_PARTITION" || true
          mv -vn "$f2_out/$PATH_LAST_MONTH_PARTITION"/** "$f2_data/$PATH_LAST_MONTH_PARTITION" || true

          # --- TF1 (same sequence as France 2) ---
          mkdir -p "$tf1_data/$PATH_YEAR_PARTITION"
          mkdir -p "$tf1_data/$PATH_PARTITION"
          mkdir -p "$tf1_data/$PATH_LAST_MONTH_PARTITION"
          rm -rf "$tf1_data/$PATH_DAY_PARTITION" || true
          rm -rf "$tf1_data/$PATH_DAY_2_DAY_AGO_PARTITION" || true
          rm -rf "$tf1_data/$PATH_DAY_3_DAY_AGO_PARTITION" || true
          mkdir -p "$tf1_out/$PATH_LAST_MONTH_PARTITION"
          mkdir -p "$tf1_out/$PATH_PARTITION"
          mv -vn "$tf1_out/$PATH_PARTITION"/** "$tf1_data/$PATH_PARTITION" || true
          mv -vn "$tf1_out/$PATH_LAST_MONTH_PARTITION"/** "$tf1_data/$PATH_LAST_MONTH_PARTITION" || true

          sbt "runMain com.github.polomarcus.main.UpdateNews"

          git config user.name polomarcus-github-actions
          # NOTE(review): this value looks like an address redacted by the
          # page the workflow was copied from - restore the real e-mail.
          # Quoted so the shell cannot treat the brackets as a glob pattern.
          git config user.email '[email protected]'
          git status
          git add .
          git commit -m "ci(data): saved date $PATH_DAY_PARTITION and $PATH_DAY_2_DAY_AGO_PARTITION and $PATH_DAY_3_DAY_AGO_PARTITION"
          git push origin main

  # Runs the DataQuality main class once the data job has finished.
  data-quality:
    runs-on: ubuntu-22.04
    needs: click-here-to-see-data
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4
        with:
          # Without an explicit ref, checkout uses the commit that triggered
          # the run, which predates the data pushed by the previous job.
          ref: main
      # - uses: coursier/cache-action@v6
      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'
          cache: 'sbt'
      - name: Check for duplicates to delete inside data-news-json folder
        run: sbt "runMain com.github.polomarcus.main.DataQuality"