Add media.ccc specific parsing of recordings

Instead of pushing the `media.ccc` recordings straight into the StreamExtractor straightjacket (which does not really provide the correct API for handling the semantics of the media.ccc data), we parse into an intermediate structure first that contains all relevant information about the different streams. This is required for correct handling of the different languages.
TeamNewPipe · Dec 22, 2024 · 4c2ff3c · 4c2ff3c
1 parent 8e92227
commit 4c2ff3c
Show file tree

Hide file tree

Showing 3 changed files with 224 additions and 51 deletions.
diff --git a/...a/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java b/...a/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java
@@ -20,6 +20,7 @@
 import org.schabi.newpipe.extractor.linkhandler.LinkHandler;
 import org.schabi.newpipe.extractor.localization.DateWrapper;
 import org.schabi.newpipe.extractor.localization.Localization;
+import org.schabi.newpipe.extractor.services.media_ccc.extractors.data.MediaCCCRecording;
 import org.schabi.newpipe.extractor.services.media_ccc.linkHandler.MediaCCCConferenceLinkHandlerFactory;
 import org.schabi.newpipe.extractor.services.media_ccc.linkHandler.MediaCCCStreamLinkHandlerFactory;
 import org.schabi.newpipe.extractor.stream.AudioStream;
@@ -28,15 +29,18 @@
 import org.schabi.newpipe.extractor.stream.StreamType;
 import org.schabi.newpipe.extractor.stream.VideoStream;
 import org.schabi.newpipe.extractor.utils.JsonUtils;
-import org.schabi.newpipe.extractor.utils.LocaleCompat;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Locale;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
 
 public class MediaCCCStreamExtractor extends StreamExtractor {
     private JsonObject data;
@@ -100,64 +104,55 @@ public List<Image> getUploaderAvatars() {
     }
 
+    /**
+     * Build one {@link AudioStream} per {@link MediaCCCRecording.Audio} entry returned by
+     * {@link #getRecordings()}; recordings of any other kind are ignored here.
+     *
+     * @return the audio streams, in the order in which the recordings were parsed
+     */
     @Override
-    public List<AudioStream> getAudioStreams() throws ExtractionException {
-        final JsonArray recordings = data.getArray("recordings");
+    public List<AudioStream> getAudioStreams() {
+        final List<MediaCCCRecording.Audio> recordings = getRecordings().stream()
+                .flatMap(r ->
+                        r instanceof MediaCCCRecording.Audio
+                                ? Stream.of((MediaCCCRecording.Audio) r)
+                                : Stream.empty()
+                )
+                .collect(Collectors.toList());
         final List<AudioStream> audioStreams = new ArrayList<>();
-        for (int i = 0; i < recordings.size(); i++) {
-            final JsonObject recording = recordings.getObject(i);
-            final String mimeType = recording.getString("mime_type");
-            if (mimeType.startsWith("audio")) {
-                // First we need to resolve the actual video data from the CDN
-                final MediaFormat mediaFormat;
-                if (mimeType.endsWith("opus")) {
-                    mediaFormat = MediaFormat.OPUS;
-                } else if (mimeType.endsWith("mpeg")) {
-                    mediaFormat = MediaFormat.MP3;
-                } else if (mimeType.endsWith("ogg")) {
-                    mediaFormat = MediaFormat.OGG;
-                } else {
-                    mediaFormat = null;
-                }
-
-                final AudioStream.Builder builder = new AudioStream.Builder()
-                        .setId(recording.getString("filename", ID_UNKNOWN))
-                        .setContent(recording.getString("recording_url"), true)
-                        .setMediaFormat(mediaFormat)
-                        .setAverageBitrate(UNKNOWN_BITRATE);
-
-                final String language = recording.getString("language");
-                // If the language contains a - symbol, this means that the stream has an audio
-                // track with multiple languages, so there is no specific language for this stream
-                // Don't set the audio language in this case
-                if (language != null && !language.contains("-")) {
-                    builder.setAudioLocale(LocaleCompat.forLanguageTag(language).orElseThrow(() ->
-                        new ParsingException(
-                                "Cannot convert this language to a locale: " + language)
-                    ));
-                }
-
-                // Not checking containsSimilarStream here, since MediaCCC does not provide enough
-                // information to decide whether two streams are similar. Hence that method would
-                // always return false, e.g. even for different language variations.
-                audioStreams.add(builder.build());
+        for (final MediaCCCRecording.Audio recording : recordings) {
+            // Derive the media format from the MIME type suffix; null if it is not recognised
+            final MediaFormat mediaFormat;
+            if (recording.mimeType.endsWith("opus")) {
+                mediaFormat = MediaFormat.OPUS;
+            } else if (recording.mimeType.endsWith("mpeg")) {
+                mediaFormat = MediaFormat.MP3;
+            } else if (recording.mimeType.endsWith("ogg")) {
+                mediaFormat = MediaFormat.OGG;
+            } else {
+                mediaFormat = null;
             }
+            audioStreams.add(new AudioStream.Builder()
+                    .setId(recording.filename)
+                    .setContent(recording.url, true)
+                    .setMediaFormat(mediaFormat)
+                    .setAverageBitrate(UNKNOWN_BITRATE)
+                    .setAudioLocale(recording.language)
+                    .build());
         }
         return audioStreams;
     }
 
     @Override
     public List<VideoStream> getVideoStreams() throws ExtractionException {
-        final JsonArray recordings = data.getArray("recordings");
+
+        final List<MediaCCCRecording.Video> recordings = getRecordings().stream()
+                .flatMap(r ->
+                        r instanceof MediaCCCRecording.Video
+                                ? Stream.of((MediaCCCRecording.Video) r)
+                                : Stream.empty()
+                )
+                .collect(Collectors.toList());
         final List<VideoStream> videoStreams = new ArrayList<>();
-        for (int i = 0; i < recordings.size(); i++) {
-            final JsonObject recording = recordings.getObject(i);
-            final String mimeType = recording.getString("mime_type");
-            if (mimeType.startsWith("video")) {
+        for (final MediaCCCRecording.Video recording : recordings) {
                 // First we need to resolve the actual video data from the CDN
                 final MediaFormat mediaFormat;
-                if (mimeType.endsWith("webm")) {
+                if (recording.mimeType.endsWith("webm")) {
                     mediaFormat = MediaFormat.WEBM;
-                } else if (mimeType.endsWith("mp4")) {
+                } else if (recording.mimeType.endsWith("mp4")) {
                     mediaFormat = MediaFormat.MPEG_4;
                 } else {
                     mediaFormat = null;
@@ -167,18 +162,119 @@ public List<VideoStream> getVideoStreams() throws ExtractionException {
                 // information to decide whether two streams are similar. Hence that method would
                 // always return false, e.g. even for different language variations.
                 videoStreams.add(new VideoStream.Builder()
-                        .setId(recording.getString("filename", ID_UNKNOWN))
-                        .setContent(recording.getString("recording_url"), true)
+                        .setId(recording.filename)
+                        .setContent(recording.url, true)
                         .setIsVideoOnly(false)
                         .setMediaFormat(mediaFormat)
-                        .setResolution(recording.getInt("height") + "p")
+                        .setResolution(recording.height + "p")
                         .build());
-            }
         }
 
         return videoStreams;
     }
 
+    /**
+     * Parse the {@code recordings} array of the event JSON into typed
+     * {@link MediaCCCRecording} objects.
+     *
+     * <p>Each entry is classified as follows, first match wins:</p>
+     * <ul>
+     *   <li>MIME type starting with {@code video/}: {@link MediaCCCRecording.Video}</li>
+     *   <li>MIME type starting with {@code audio/}: {@link MediaCCCRecording.Audio}</li>
+     *   <li>MIME type {@code application/x-subrip}: {@link MediaCCCRecording.Subtitle}</li>
+     *   <li>any other {@code application/} MIME type whose {@code folder} contains
+     *       {@code slides}: {@link MediaCCCRecording.Slides}</li>
+     *   <li>everything else: {@link MediaCCCRecording.Unknown}</li>
+     * </ul>
+     *
+     * <p>Entries without a {@code mime_type} end up as {@link MediaCCCRecording.Unknown};
+     * a missing {@code folder} or {@code language} is tolerated as well.</p>
+     *
+     * @return one {@link MediaCCCRecording} per element of the {@code recordings} array, in
+     *         the same order
+     */
+    public List<MediaCCCRecording> getRecordings() {
+        final JsonArray recordingsArray = data.getArray("recordings");
+        final List<MediaCCCRecording> recordings = new ArrayList<>();
+        for (int i = 0; i < recordingsArray.size(); i++) {
+            final JsonObject recording = recordingsArray.getObject(i);
+            // Default to "" so that an entry lacking these keys is still classified instead of
+            // aborting the whole extraction with a NullPointerException.
+            final String mimeType = recording.getString("mime_type", "");
+            final String folder = recording.getString("folder", "");
+            final String filename = recording.getString("filename", ID_UNKNOWN);
+            final String url = recording.getString("recording_url");
+            // May be null when the entry carries no language information.
+            final String languages = recording.getString("language");
+            // Single locale used by the audio, subtitle and slides entries.
+            final Locale language = languages == null
+                    ? null
+                    : mediaCCCLanguageTagToLocale(languages);
+
+            if (mimeType.startsWith("video/")) {
+                final MediaCCCRecording.Video v = new MediaCCCRecording.Video();
+                v.filename = filename;
+                // they will put the slides videos into the "slides" folder
+                v.recordingType = folder.contains("slides")
+                        ? MediaCCCRecording.VideoType.SLIDES
+                        : MediaCCCRecording.VideoType.MAIN;
+                v.mimeType = mimeType;
+                // A video may carry several audio tracks; their tags are joined with '-'.
+                // Tags that cannot be mapped to a Locale are dropped.
+                v.languages = languages == null
+                        ? new ArrayList<>()
+                        : Arrays.stream(languages.split("-"))
+                                .map(MediaCCCStreamExtractor::mediaCCCLanguageTagToLocale)
+                                .filter(l -> l != null)
+                                .collect(Collectors.toList());
+                v.url = url;
+                v.lengthSeconds = recording.getInt("length");
+                v.width = recording.getInt("width");
+                v.height = recording.getInt("height");
+                recordings.add(v);
+            } else if (mimeType.startsWith("audio/")) {
+                final MediaCCCRecording.Audio a = new MediaCCCRecording.Audio();
+                a.filename = filename;
+                a.mimeType = mimeType;
+                a.language = language;
+                a.url = url;
+                a.lengthSeconds = recording.getInt("length");
+                recordings.add(a);
+            } else if ("application/x-subrip".equals(mimeType)) {
+                // equals(), not ==: the parsed JSON string is never the same object as the
+                // literal, so a reference comparison would never match.
+                final MediaCCCRecording.Subtitle s = new MediaCCCRecording.Subtitle();
+                s.filename = filename;
+                s.mimeType = mimeType;
+                s.language = language;
+                s.url = url;
+                recordings.add(s);
+            } else if (mimeType.startsWith("application/") && folder.contains("slides")) {
+                final MediaCCCRecording.Slides s = new MediaCCCRecording.Slides();
+                s.filename = filename;
+                s.mimeType = mimeType;
+                s.language = language;
+                s.url = url;
+                recordings.add(s);
+            } else {
+                final MediaCCCRecording.Unknown u = new MediaCCCRecording.Unknown();
+                u.filename = filename;
+                u.mimeType = mimeType;
+                u.url = url;
+                u.rawObject = recording;
+                recordings.add(u);
+            }
+        }
+
+        return recordings;
+    }
+
+    /**
+     * Translate a media.ccc.de language tag to a {@link Locale}.
+     * The tags handled here are three-letter codes such as {@code deu} or {@code eng}.
+     * In case there is still a {@code -} in the string (several languages joined together),
+     * only the part before the first {@code -} is looked at.
+     *
+     * @param language language tag; may be {@code null}
+     * @return the matching {@link Locale}, or {@code null} if the tag is {@code null} or is
+     *         not one of the tags handled by the switch below
+     */
+    private static @Nullable Locale mediaCCCLanguageTagToLocale(@Nullable final String language) {
+        if (language == null) {
+            // Entries without language information simply have no locale.
+            return null;
+        }
+        // TODO: would be cool if we could WARN here, but let's just continue
+        //  in case there's still a separator
+        final int idx = language.indexOf('-');
+        final String tag = idx == -1 ? language : language.substring(0, idx);
+        switch (tag) {
+            case "deu":
+                return Locale.GERMAN;
+            case "eng":
+                return Locale.ENGLISH;
+            case "fra":
+                return Locale.FRENCH;
+            case "ita":
+                return Locale.ITALIAN;
+            case "spa":
+                // java.util.Locale has no predefined constant for Spanish
+                return Locale.forLanguageTag("es");
+            default:
+                return null;
+        }
+    }
+
     @Override
     public List<VideoStream> getVideoOnlyStreams() {
         return Collections.emptyList();

diff --git a/...va/org/schabi/newpipe/extractor/services/media_ccc/extractors/data/MediaCCCRecording.java b/...va/org/schabi/newpipe/extractor/services/media_ccc/extractors/data/MediaCCCRecording.java
@@ -0,0 +1,72 @@
+package org.schabi.newpipe.extractor.services.media_ccc.extractors.data;
+
+import com.grack.nanojson.JsonObject;
+
+import java.util.List;
+import java.util.Locale;
+
+import javax.annotation.Nullable;
+
+
+/**
+ * A recording stream of a talk/event. Switch on the implementation to get the actual data.
+ *
+ * <p>The implementations declared here are plain data holders: they only consist of public
+ * fields and carry no behaviour of their own.</p>
+ */
+public interface MediaCCCRecording {
+
+    /** A recording stream of a talk/event.
+     * These files usually have one or more audio streams in different languages. */
+    class Video implements MediaCCCRecording {
+        /** File name of the recording. */
+        public String filename;
+        /** Whether this is the main recording or a slides-only recording. */
+        public VideoType recordingType;
+        /** MIME type of the file, e.g. one starting with {@code video/}. */
+        public String mimeType;
+        /** Each language is one separate audio track on the video. */
+        public List<Locale> languages;
+        /** URL the recording can be fetched from. */
+        public String url;
+        /** Length of the recording (seconds, going by the field name). */
+        public int lengthSeconds;
+        /** Width of the video frame. */
+        public int width;
+        /** Height of the video frame. */
+        public int height;
+    }
+
+    /** Some talks have multiple kinds of video. */
+    enum VideoType {
+        /** The main recording of a talk/event. */
+        MAIN,
+        /** A side-recording of a talk/event that has the slides full-screen.
+         * Usually, if there is a slides recording, there is a MAIN recording as well. */
+        SLIDES
+    }
+
+    /** An audio recording of a talk/event.
+     * These audio streams are usually also available in their respective video streams.
+     */
+    class Audio implements MediaCCCRecording {
+        /** File name of the recording. */
+        public String filename;
+        /** MIME type of the file, e.g. one starting with {@code audio/}. */
+        public String mimeType;
+        /** Language of the audio, or {@code null} if no locale could be determined. */
+        public @Nullable Locale language;
+        /** URL the recording can be fetched from. */
+        public String url;
+        /** Length of the recording (seconds, going by the field name). */
+        public int lengthSeconds;
+    }
+
+    /** A subtitle file of a talk/event. */
+    class Subtitle implements MediaCCCRecording {
+        /** File name of the subtitle file. */
+        public String filename;
+        /** MIME type of the subtitle file. */
+        public String mimeType;
+        /** Language of the subtitles, or {@code null} if no locale could be determined. */
+        public @Nullable Locale language;
+        /** URL the subtitle file can be fetched from. */
+        public String url;
+    }
+
+    /** The Slides of the talk, usually as PDF file. */
+    class Slides implements MediaCCCRecording {
+        /** File name of the slides file. */
+        public String filename;
+        /** MIME type of the slides file. */
+        public String mimeType;
+        /** URL the slides file can be fetched from. */
+        public String url;
+        /** Language of the slides, or {@code null} if no locale could be determined. */
+        public @Nullable Locale language;
+    }
+
+    /** Anything we can’t put in any of the other categories. */
+    class Unknown implements MediaCCCRecording {
+        /** File name of the entry. */
+        public String filename;
+        /** MIME type of the entry, as far as it is known. */
+        public String mimeType;
+        /** URL the entry can be fetched from. */
+        public String url;
+        /** The raw object for easier debugging. */
+        public JsonObject rawObject;
+    }
+}
diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java
@@ -257,6 +257,11 @@ public static String followGoogleRedirectIfNeeded(final String url) {
         return url;
     }
 
+    /**
+     * Check whether the string is {@code null} or the empty string.
+     * @param str the string to check; may be {@code null}
+     * @return {@code true} if {@code str} is {@code null} or empty, {@code false} otherwise
+     */
     public static boolean isNullOrEmpty(final String str) {
         return str == null || str.isEmpty();
     }