From 9b149222392212676884209b1ea41498f50728bc Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Wed, 10 Aug 2022 12:38:23 -0700 Subject: [PATCH 1/4] Raise FileNotFoundError when file cannot be found on GCS. --- contentcuration/contentcuration/utils/gcs_storage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contentcuration/contentcuration/utils/gcs_storage.py b/contentcuration/contentcuration/utils/gcs_storage.py index bef04a7798..07d8e899f3 100644 --- a/contentcuration/contentcuration/utils/gcs_storage.py +++ b/contentcuration/contentcuration/utils/gcs_storage.py @@ -58,6 +58,9 @@ def open(self, name, mode="rb", blob_object=None): else: blob = blob_object + if blob is None: + raise FileNotFoundError("{} not found".format(name)) + fobj = tempfile.NamedTemporaryFile() blob.download_to_file(fobj) # flush it to disk From 89ec2e94ca15c030900cc2efd14ef87c7240351f Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Tue, 30 Aug 2022 16:37:42 -0700 Subject: [PATCH 2/4] Handle failure to infer duration from streamed media. 
--- .../management/commands/set_file_duration.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/contentcuration/contentcuration/management/commands/set_file_duration.py b/contentcuration/contentcuration/management/commands/set_file_duration.py index fd09aaf51d..1e828dac05 100644 --- a/contentcuration/contentcuration/management/commands/set_file_duration.py +++ b/contentcuration/contentcuration/management/commands/set_file_duration.py @@ -14,7 +14,7 @@ CHUNKSIZE = 10000 -def extract_duration_of_media(f_in, extension): +def extract_duration_of_media(f_in, extension): # noqa C901 """ For more details on these commands, refer to the ffmpeg Wiki: https://trac.ffmpeg.org/wiki/FFprobeTips#Formatcontainerduration @@ -55,9 +55,12 @@ def extract_duration_of_media(f_in, extension): stdin=f_in, stderr=subprocess.PIPE ) - second_last_line = result.stderr.decode("utf-8").strip().splitlines()[-2] - time_code = second_last_line.split(" time=")[1].split(" ")[0] - hours, minutes, seconds = time_code.split(":") + try: + second_last_line = result.stderr.decode("utf-8").strip().splitlines()[-2] + time_code = second_last_line.split(" time=")[1].split(" ")[0] + hours, minutes, seconds = time_code.split(":") + except IndexError: + raise RuntimeError("Unable to determine media length") try: hours = int(hours) except ValueError: @@ -103,7 +106,7 @@ def handle(self, *args, **options): except FileNotFoundError: logging.warning("File {} not found".format(file)) excluded_files.add(file.file_on_disk.name) - except subprocess.CalledProcessError: + except (subprocess.CalledProcessError, RuntimeError): logging.warning("File {} could not be read for duration".format(file)) excluded_files.add(file.file_on_disk.name) From 13576b41f2e2ec6bb3d5b9a29485371c0d717ebb Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Tue, 30 Aug 2022 17:39:19 -0700 Subject: [PATCH 3/4] Adds a management command to infer from the database if a resource node should be annotated with 
the has captions metadata. --- Makefile | 3 ++ .../commands/set_orm_based_has_captions.py | 50 +++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py diff --git a/Makefile b/Makefile index 383bd3ad32..282b90de57 100644 --- a/Makefile +++ b/Makefile @@ -141,6 +141,9 @@ filedurations: learningactivities: python contentcuration/manage.py set_default_learning_activities +hascaptions: + python contentcuration/manage.py set_orm_based_has_captions + export COMPOSE_PROJECT_NAME=studio_$(shell git rev-parse --abbrev-ref HEAD) purge-postgres: diff --git a/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py new file mode 100644 index 0000000000..3eaf83ce37 --- /dev/null +++ b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py @@ -0,0 +1,50 @@ +import logging as logmodule +import time + +from django.core.management.base import BaseCommand +from django.db.models import Exists +from django.db.models import OuterRef +from le_utils.constants import content_kinds +from le_utils.constants import format_presets +from le_utils.constants.labels import accessibility_categories + +from contentcuration.models import ContentNode +from contentcuration.models import File + +logmodule.basicConfig(level=logmodule.INFO) +logging = logmodule.getLogger('command') + + +CHUNKSIZE = 10000 + + +class Command(BaseCommand): + + def handle(self, *args, **options): + start = time.time() + + logging.info("Setting 'has captions' for video kinds") + + # Only try to update video nodes which have not had any accessibility labels set on them + # this will allow this management command to be rerun and resume from where it left off + # and also prevent stomping previous edits to the accessibility_labels field. 
+ updateable_nodes = ContentNode.objects.annotate( + has_captions=Exists(File.objects.filter(contentnode=OuterRef("id"), language=OuterRef("language"), preset_id=format_presets.VIDEO_SUBTITLE)) + ).filter(kind=content_kinds.VIDEO, accessibility_labels__isnull=True, has_captions=True) + + updateable_node_slice = list(updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE]) + + count = 0 + + while updateable_node_slice: + ContentNode.objects.filter(id__in=updateable_node_slice).update(accessibility_labels={accessibility_categories.CAPTIONS_SUBTITLES: True}) + + this_count = len(updateable_node_slice) + + logging.info("Set has captions metadata for {} nodes".format(this_count)) + + count += this_count + + updateable_node_slice = list(updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE]) + + logging.info('Finished setting all has captions metadata for {} nodes in {} seconds'.format(count, time.time() - start)) From 86178721937e1db64cc7fbd9abb50a9cc9e149c7 Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Wed, 31 Aug 2022 11:59:53 -0700 Subject: [PATCH 4/4] Simplified subquery used based on review feedback. 
--- .../commands/set_orm_based_has_captions.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py index 3eaf83ce37..edbcbbcd40 100644 --- a/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py +++ b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py @@ -25,26 +25,25 @@ def handle(self, *args, **options): logging.info("Setting 'has captions' for video kinds") + has_captions_subquery = Exists(File.objects.filter(contentnode=OuterRef("id"), language=OuterRef("language"), preset_id=format_presets.VIDEO_SUBTITLE)) # Only try to update video nodes which have not had any accessibility labels set on them # this will allow this management command to be rerun and resume from where it left off # and also prevent stomping previous edits to the accessibility_labels field. 
- updateable_nodes = ContentNode.objects.annotate( - has_captions=Exists(File.objects.filter(contentnode=OuterRef("id"), language=OuterRef("language"), preset_id=format_presets.VIDEO_SUBTITLE)) - ).filter(kind=content_kinds.VIDEO, accessibility_labels__isnull=True, has_captions=True) + updateable_nodes = ContentNode.objects.filter(has_captions_subquery, kind=content_kinds.VIDEO, accessibility_labels__isnull=True) - updateable_node_slice = list(updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE]) + updateable_node_slice = updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE] count = 0 - while updateable_node_slice: - ContentNode.objects.filter(id__in=updateable_node_slice).update(accessibility_labels={accessibility_categories.CAPTIONS_SUBTITLES: True}) - - this_count = len(updateable_node_slice) + while updateable_nodes.exists(): + this_count = ContentNode.objects.filter( + id__in=updateable_node_slice + ).update(accessibility_labels={accessibility_categories.CAPTIONS_SUBTITLES: True}) logging.info("Set has captions metadata for {} nodes".format(this_count)) count += this_count - updateable_node_slice = list(updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE]) + updateable_node_slice = updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE] logging.info('Finished setting all has captions metadata for {} nodes in {} seconds'.format(count, time.time() - start))