From 9b149222392212676884209b1ea41498f50728bc Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Wed, 10 Aug 2022 12:38:23 -0700
Subject: [PATCH 1/4] Raise FileNotFoundError when file cannot be found on GCS.

---
 contentcuration/contentcuration/utils/gcs_storage.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/contentcuration/contentcuration/utils/gcs_storage.py b/contentcuration/contentcuration/utils/gcs_storage.py
index bef04a7798..07d8e899f3 100644
--- a/contentcuration/contentcuration/utils/gcs_storage.py
+++ b/contentcuration/contentcuration/utils/gcs_storage.py
@@ -58,6 +58,9 @@ def open(self, name, mode="rb", blob_object=None):
         else:
             blob = blob_object
 
+        if blob is None:
+            raise FileNotFoundError("{} not found".format(name))
+
         fobj = tempfile.NamedTemporaryFile()
         blob.download_to_file(fobj)
         # flush it to disk

From 89ec2e94ca15c030900cc2efd14ef87c7240351f Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Tue, 30 Aug 2022 16:37:42 -0700
Subject: [PATCH 2/4] Handle failure to infer duration from streamed media.

---
 .../management/commands/set_file_duration.py        | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/contentcuration/contentcuration/management/commands/set_file_duration.py b/contentcuration/contentcuration/management/commands/set_file_duration.py
index fd09aaf51d..1e828dac05 100644
--- a/contentcuration/contentcuration/management/commands/set_file_duration.py
+++ b/contentcuration/contentcuration/management/commands/set_file_duration.py
@@ -14,7 +14,7 @@
 CHUNKSIZE = 10000
 
 
-def extract_duration_of_media(f_in, extension):
+def extract_duration_of_media(f_in, extension):  # noqa C901
     """
     For more details on these commands, refer to the ffmpeg Wiki:
     https://trac.ffmpeg.org/wiki/FFprobeTips#Formatcontainerduration
@@ -55,9 +55,12 @@ def extract_duration_of_media(f_in, extension):
             stdin=f_in,
             stderr=subprocess.PIPE
         )
-        second_last_line = result.stderr.decode("utf-8").strip().splitlines()[-2]
-        time_code = second_last_line.split(" time=")[1].split(" ")[0]
-        hours, minutes, seconds = time_code.split(":")
+        try:
+            second_last_line = result.stderr.decode("utf-8").strip().splitlines()[-2]
+            time_code = second_last_line.split(" time=")[1].split(" ")[0]
+            hours, minutes, seconds = time_code.split(":")
+        except IndexError:
+            raise RuntimeError("Unable to determine media length")
         try:
             hours = int(hours)
         except ValueError:
@@ -103,7 +106,7 @@ def handle(self, *args, **options):
                 except FileNotFoundError:
                     logging.warning("File {} not found".format(file))
                     excluded_files.add(file.file_on_disk.name)
-                except subprocess.CalledProcessError:
+                except (subprocess.CalledProcessError, RuntimeError):
                     logging.warning("File {} could not be read for duration".format(file))
                     excluded_files.add(file.file_on_disk.name)
 

From 13576b41f2e2ec6bb3d5b9a29485371c0d717ebb Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Tue, 30 Aug 2022 17:39:19 -0700
Subject: [PATCH 3/4] Adds a management command to infer from the database if a
 resource node should be annotated with the has captions metadata.

---
 Makefile                                      |  3 ++
 .../commands/set_orm_based_has_captions.py    | 50 +++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py

diff --git a/Makefile b/Makefile
index 383bd3ad32..282b90de57 100644
--- a/Makefile
+++ b/Makefile
@@ -141,6 +141,9 @@ filedurations:
 learningactivities:
 	python contentcuration/manage.py set_default_learning_activities
 
+hascaptions:
+	python contentcuration/manage.py set_orm_based_has_captions
+
 export COMPOSE_PROJECT_NAME=studio_$(shell git rev-parse --abbrev-ref HEAD)
 
 purge-postgres:
diff --git a/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py
new file mode 100644
index 0000000000..3eaf83ce37
--- /dev/null
+++ b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py
@@ -0,0 +1,50 @@
+import logging as logmodule
+import time
+
+from django.core.management.base import BaseCommand
+from django.db.models import Exists
+from django.db.models import OuterRef
+from le_utils.constants import content_kinds
+from le_utils.constants import format_presets
+from le_utils.constants.labels import accessibility_categories
+
+from contentcuration.models import ContentNode
+from contentcuration.models import File
+
+logmodule.basicConfig(level=logmodule.INFO)
+logging = logmodule.getLogger('command')
+
+
+CHUNKSIZE = 10000
+
+
+class Command(BaseCommand):
+
+    def handle(self, *args, **options):
+        start = time.time()
+
+        logging.info("Setting 'has captions' for video kinds")
+
+        # Only try to update video nodes which have not had any accessibility labels set on them
+        # this will allow this management command to be rerun and resume from where it left off
+        # and also prevent stomping previous edits to the accessibility_labels field.
+        updateable_nodes = ContentNode.objects.annotate(
+            has_captions=Exists(File.objects.filter(contentnode=OuterRef("id"), language=OuterRef("language"), preset_id=format_presets.VIDEO_SUBTITLE))
+        ).filter(kind=content_kinds.VIDEO, accessibility_labels__isnull=True, has_captions=True)
+
+        updateable_node_slice = list(updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE])
+
+        count = 0
+
+        while updateable_node_slice:
+            ContentNode.objects.filter(id__in=updateable_node_slice).update(accessibility_labels={accessibility_categories.CAPTIONS_SUBTITLES: True})
+
+            this_count = len(updateable_node_slice)
+
+            logging.info("Set has captions metadata for {} nodes".format(this_count))
+
+            count += this_count
+
+            updateable_node_slice = list(updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE])
+
+        logging.info('Finished setting all has captions metadata for {} nodes in {} seconds'.format(count, time.time() - start))

From 86178721937e1db64cc7fbd9abb50a9cc9e149c7 Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Wed, 31 Aug 2022 11:59:53 -0700
Subject: [PATCH 4/4] Simplified subquery used based on review feedback.

---
 .../commands/set_orm_based_has_captions.py      | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py
index 3eaf83ce37..edbcbbcd40 100644
--- a/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py
+++ b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py
@@ -25,26 +25,25 @@ def handle(self, *args, **options):
 
         logging.info("Setting 'has captions' for video kinds")
 
+        has_captions_subquery = Exists(File.objects.filter(contentnode=OuterRef("id"), language=OuterRef("language"), preset_id=format_presets.VIDEO_SUBTITLE))
         # Only try to update video nodes which have not had any accessibility labels set on them
         # this will allow this management command to be rerun and resume from where it left off
         # and also prevent stomping previous edits to the accessibility_labels field.
-        updateable_nodes = ContentNode.objects.annotate(
-            has_captions=Exists(File.objects.filter(contentnode=OuterRef("id"), language=OuterRef("language"), preset_id=format_presets.VIDEO_SUBTITLE))
-        ).filter(kind=content_kinds.VIDEO, accessibility_labels__isnull=True, has_captions=True)
+        updateable_nodes = ContentNode.objects.filter(has_captions_subquery, kind=content_kinds.VIDEO, accessibility_labels__isnull=True)
 
-        updateable_node_slice = list(updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE])
+        updateable_node_slice = updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE]
 
         count = 0
 
-        while updateable_node_slice:
-            ContentNode.objects.filter(id__in=updateable_node_slice).update(accessibility_labels={accessibility_categories.CAPTIONS_SUBTITLES: True})
-
-            this_count = len(updateable_node_slice)
+        while updateable_nodes.exists():
+            this_count = ContentNode.objects.filter(
+                id__in=updateable_node_slice
+            ).update(accessibility_labels={accessibility_categories.CAPTIONS_SUBTITLES: True})
 
             logging.info("Set has captions metadata for {} nodes".format(this_count))
 
             count += this_count
 
-            updateable_node_slice = list(updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE])
+            updateable_node_slice = updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE]
 
         logging.info('Finished setting all has captions metadata for {} nodes in {} seconds'.format(count, time.time() - start))