From 4c2ff3c98f54e3509984860d33cbe7676c4d21f2 Mon Sep 17 00:00:00 2001 From: Profpatsch Date: Sat, 21 Dec 2024 02:43:42 +0100 Subject: [PATCH] Add media.ccc specific parsing of recordings Instead of pushing the `media.ccc` recordings straight into the StreamExtractor straightjacket (which does not really provide the correct API for handling the semantics of the media.ccc data), we parse into an intermediate structure first that contains all relevant information about the different streams. This is required for correct handling of the different languages. --- .../extractors/MediaCCCStreamExtractor.java | 198 +++++++++++++----- .../extractors/data/MediaCCCRecording.java | 72 +++++++ .../schabi/newpipe/extractor/utils/Utils.java | 5 + 3 files changed, 224 insertions(+), 51 deletions(-) create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/data/MediaCCCRecording.java diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java index 99ddf5e08c..daf686332f 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/MediaCCCStreamExtractor.java @@ -20,6 +20,7 @@ import org.schabi.newpipe.extractor.linkhandler.LinkHandler; import org.schabi.newpipe.extractor.localization.DateWrapper; import org.schabi.newpipe.extractor.localization.Localization; +import org.schabi.newpipe.extractor.services.media_ccc.extractors.data.MediaCCCRecording; import org.schabi.newpipe.extractor.services.media_ccc.linkHandler.MediaCCCConferenceLinkHandlerFactory; import org.schabi.newpipe.extractor.services.media_ccc.linkHandler.MediaCCCStreamLinkHandlerFactory; import org.schabi.newpipe.extractor.stream.AudioStream; @@ 
-28,15 +29,18 @@ import org.schabi.newpipe.extractor.stream.StreamType; import org.schabi.newpipe.extractor.stream.VideoStream; import org.schabi.newpipe.extractor.utils.JsonUtils; -import org.schabi.newpipe.extractor.utils.LocaleCompat; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Locale; +import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.annotation.Nonnull; +import javax.annotation.Nullable; public class MediaCCCStreamExtractor extends StreamExtractor { private JsonObject data; @@ -100,64 +104,55 @@ public List getUploaderAvatars() { } @Override - public List<AudioStream> getAudioStreams() throws ExtractionException { - final JsonArray recordings = data.getArray("recordings"); + public List<AudioStream> getAudioStreams() { + final List<MediaCCCRecording.Audio> recordings = getRecordings().stream() + .flatMap(r -> + r instanceof MediaCCCRecording.Audio + ? Stream.of((MediaCCCRecording.Audio) r) + : Stream.empty() + ) + .collect(Collectors.toList()); final List<AudioStream> audioStreams = new ArrayList<>(); - for (int i = 0; i < recordings.size(); i++) { - final JsonObject recording = recordings.getObject(i); - final String mimeType = recording.getString("mime_type"); - if (mimeType.startsWith("audio")) { - // First we need to resolve the actual video data from the CDN - final MediaFormat mediaFormat; - if (mimeType.endsWith("opus")) { - mediaFormat = MediaFormat.OPUS; - } else if (mimeType.endsWith("mpeg")) { - mediaFormat = MediaFormat.MP3; - } else if (mimeType.endsWith("ogg")) { - mediaFormat = MediaFormat.OGG; - } else { - mediaFormat = null; - } - - final AudioStream.Builder builder = new AudioStream.Builder() - .setId(recording.getString("filename", ID_UNKNOWN)) - .setContent(recording.getString("recording_url"), true) - .setMediaFormat(mediaFormat) - .setAverageBitrate(UNKNOWN_BITRATE); - - final String language = recording.getString("language"); - // If the language contains a - 
symbol, this means that the stream has an audio - // track with multiple languages, so there is no specific language for this stream - // Don't set the audio language in this case - if (language != null && !language.contains("-")) { - builder.setAudioLocale(LocaleCompat.forLanguageTag(language).orElseThrow(() -> - new ParsingException( - "Cannot convert this language to a locale: " + language) - )); - } - - // Not checking containsSimilarStream here, since MediaCCC does not provide enough - // information to decide whether two streams are similar. Hence that method would - // always return false, e.g. even for different language variations. - audioStreams.add(builder.build()); + for (final MediaCCCRecording.Audio recording : recordings) { + // Map the MIME type suffix to a MediaFormat; stays null if the suffix is not recognized + final MediaFormat mediaFormat; + if (recording.mimeType.endsWith("opus")) { + mediaFormat = MediaFormat.OPUS; + } else if (recording.mimeType.endsWith("mpeg")) { + mediaFormat = MediaFormat.MP3; + } else if (recording.mimeType.endsWith("ogg")) { + mediaFormat = MediaFormat.OGG; + } else { + mediaFormat = null; } + audioStreams.add(new AudioStream.Builder() + .setId(recording.filename) + .setContent(recording.url, true) + .setMediaFormat(mediaFormat) + .setAverageBitrate(UNKNOWN_BITRATE) + .setAudioLocale(recording.language) + .build()); } return audioStreams; } @Override public List<VideoStream> getVideoStreams() throws ExtractionException { - final JsonArray recordings = data.getArray("recordings"); + + final List<MediaCCCRecording.Video> recordings = getRecordings().stream() + .flatMap(r -> + r instanceof MediaCCCRecording.Video + ?
Stream.of((MediaCCCRecording.Video) r) + : Stream.empty() + ) + .collect(Collectors.toList()); final List<VideoStream> videoStreams = new ArrayList<>(); - for (int i = 0; i < recordings.size(); i++) { - final JsonObject recording = recordings.getObject(i); - final String mimeType = recording.getString("mime_type"); - if (mimeType.startsWith("video")) { + for (final MediaCCCRecording.Video recording : recordings) { // First we need to resolve the actual video data from the CDN final MediaFormat mediaFormat; - if (mimeType.endsWith("webm")) { + if (recording.mimeType.endsWith("webm")) { mediaFormat = MediaFormat.WEBM; - } else if (mimeType.endsWith("mp4")) { + } else if (recording.mimeType.endsWith("mp4")) { mediaFormat = MediaFormat.MPEG_4; } else { mediaFormat = null; @@ -167,18 +162,119 @@ public List<VideoStream> getVideoStreams() throws ExtractionException { // information to decide whether two streams are similar. Hence that method would // always return false, e.g. even for different language variations. videoStreams.add(new VideoStream.Builder() - .setId(recording.getString("filename", ID_UNKNOWN)) - .setContent(recording.getString("recording_url"), true) + .setId(recording.filename) + .setContent(recording.url, true) .setIsVideoOnly(false) .setMediaFormat(mediaFormat) - .setResolution(recording.getInt("height") + "p") + .setResolution(recording.height + "p") .build()); - } } return videoStreams; } + public List<MediaCCCRecording> getRecordings() { + final JsonArray recordingsArray = data.getArray("recordings"); + final List<MediaCCCRecording> recordings = new ArrayList<>(); + for (int i = 0; i < recordingsArray.size(); i++) { + final JsonObject recording = recordingsArray.getObject(i); + final String mimeType = recording.getString("mime_type"); + final String languages = recording.getString("language", ""); + final String url = recording.getString("recording_url"); + + if (mimeType.startsWith("video/")) { + final MediaCCCRecording.Video v = + new MediaCCCRecording.Video(); + final String folder = 
recording.getString("folder"); + v.filename = recording.getString("filename", ID_UNKNOWN); + // slide recordings are recognized by "slides" appearing in their folder name + v.recordingType = folder.contains("slides") + ? MediaCCCRecording.VideoType.SLIDES + : MediaCCCRecording.VideoType.MAIN; + v.mimeType = mimeType; + v.languages = Arrays.stream(languages.split("-")) + .map(MediaCCCStreamExtractor::mediaCCCLanguageTagToLocale) + .filter(l -> l != null) + .collect(Collectors.toList()); + v.url = url; + v.lengthSeconds = recording.getInt("length"); + v.width = recording.getInt("width"); + v.height = recording.getInt("height"); + recordings.add(v); + continue; + } + if (mimeType.startsWith("audio/")) { + final MediaCCCRecording.Audio a = + new MediaCCCRecording.Audio(); + a.filename = recording.getString("filename", ID_UNKNOWN); + a.mimeType = mimeType; + a.language = mediaCCCLanguageTagToLocale(languages); + a.url = url; + a.lengthSeconds = recording.getInt("length"); + recordings.add(a); + continue; + } + if ("application/x-subrip".equals(mimeType)) { + final MediaCCCRecording.Subtitle s = + new MediaCCCRecording.Subtitle(); + s.filename = recording.getString("filename", ID_UNKNOWN); + s.mimeType = mimeType; + s.language = mediaCCCLanguageTagToLocale(languages); + s.url = url; + recordings.add(s); + continue; + } + final String folder = recording.getString("folder"); + if (mimeType.startsWith("application/") && folder.contains("slides")) { + final MediaCCCRecording.Slides s = + new MediaCCCRecording.Slides(); + s.filename = recording.getString("filename", ID_UNKNOWN); + s.mimeType = mimeType; + s.language = mediaCCCLanguageTagToLocale(languages); + s.url = url; + recordings.add(s); + continue; + } + final MediaCCCRecording.Unknown u = + new MediaCCCRecording.Unknown(); + u.filename = recording.getString("filename", ID_UNKNOWN); + u.mimeType = mimeType; + u.url = url; + u.rawObject = recording; + recordings.add(u); + } + + return recordings; + } + + /** Translate the media.ccc.de language 
tag to a Locale. + * They use the first three letters of the German word for the language. + * In case there’s still a `-` in the string, only the part before the first `-` is used. + * @param language language tag + * @return the matching Locale, or null if we don’t have that language in our switch + */ + private static @Nullable Locale mediaCCCLanguageTagToLocale(@Nonnull String language) { + final int idx = language.indexOf('-'); + if (idx != -1) { + // TODO: would be cool if we could WARN here, but let’s just continue in case there’s still a separator + language = language.substring(0, idx); + } + switch (language) { + case "deu": + return Locale.GERMAN; + case "eng": + return Locale.ENGLISH; + case "fra": + return Locale.FRENCH; + case "ita": + return Locale.ITALIAN; + case "spa": + return Locale.forLanguageTag("es"); + default: + return null; + } + } + @Override public List getVideoOnlyStreams() { return Collections.emptyList(); diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/data/MediaCCCRecording.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/data/MediaCCCRecording.java new file mode 100644 index 0000000000..16f3bf5192 --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/media_ccc/extractors/data/MediaCCCRecording.java @@ -0,0 +1,72 @@ +package org.schabi.newpipe.extractor.services.media_ccc.extractors.data; + +import com.grack.nanojson.JsonObject; + +import java.util.List; +import java.util.Locale; + +import javax.annotation.Nullable; + + +/** A recording stream of a talk/event. Switch on the implementation to get the actual data. */ +public interface MediaCCCRecording { + + /** A recording stream of a talk/event. + * These files usually have one or more audio streams in different languages. 
*/ + class Video implements MediaCCCRecording { + public String filename; + public VideoType recordingType; + public String mimeType; + /** Each language is one separate audio track on the video. */ + public List<Locale> languages; + public String url; + public int lengthSeconds; + public int width; + public int height; + } + + /** Some talks have multiple kinds of video. */ + enum VideoType { + /** The main recording of a talk/event. */ + MAIN, + /** A side-recording of a talk/event that has the slides full-screen. + * Usually if there is a slide-recording there is a MAIN recording as well. */ + SLIDES + } + + /** An audio recording of a talk/event. + * These audio streams are usually also available in their respective video streams. + */ + class Audio implements MediaCCCRecording { + public String filename; + public String mimeType; + public @Nullable Locale language; + public String url; + public int lengthSeconds; + } + + /** A subtitle file of a talk/event. */ + class Subtitle implements MediaCCCRecording { + public String filename; + public String mimeType; + public @Nullable Locale language; + public String url; + } + + /** The slides of the talk, usually as a PDF file. */ + class Slides implements MediaCCCRecording { + public String filename; + public String mimeType; + public String url; + public @Nullable Locale language; + } + + /** Anything we can’t put in any of the other categories. */ + class Unknown implements MediaCCCRecording { + public String filename; + public String mimeType; + public String url; + /** The raw object for easier debugging. 
*/ + public JsonObject rawObject; + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java index c061ce30fa..d79dc3c04f 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/Utils.java @@ -257,6 +257,11 @@ public static String followGoogleRedirectIfNeeded(final String url) { return url; } + /** + * Check if the string is `null`, or the empty string. + * @param str string + * @return true if null or empty, false otherwise + */ public static boolean isNullOrEmpty(final String str) { return str == null || str.isEmpty(); }