Skip to content

Commit

Permalink
Add media.ccc specific parsing of recordings
Browse files Browse the repository at this point in the history
Instead of pushing the `media.ccc` recordings straight into the
StreamExtractor straightjacket (which does not really provide the
correct API for handling the semantics of the media.ccc data), we
parse into an intermediate structure first that contains all relevant
information about the different streams.

This is required for correct handling of the different languages.
  • Loading branch information
Profpatsch committed Dec 22, 2024
1 parent 8e92227 commit 4c2ff3c
Show file tree
Hide file tree
Showing 3 changed files with 224 additions and 51 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.schabi.newpipe.extractor.linkhandler.LinkHandler;
import org.schabi.newpipe.extractor.localization.DateWrapper;
import org.schabi.newpipe.extractor.localization.Localization;
import org.schabi.newpipe.extractor.services.media_ccc.extractors.data.MediaCCCRecording;
import org.schabi.newpipe.extractor.services.media_ccc.linkHandler.MediaCCCConferenceLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.media_ccc.linkHandler.MediaCCCStreamLinkHandlerFactory;
import org.schabi.newpipe.extractor.stream.AudioStream;
Expand All @@ -28,15 +29,18 @@
import org.schabi.newpipe.extractor.stream.StreamType;
import org.schabi.newpipe.extractor.stream.VideoStream;
import org.schabi.newpipe.extractor.utils.JsonUtils;
import org.schabi.newpipe.extractor.utils.LocaleCompat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

public class MediaCCCStreamExtractor extends StreamExtractor {
private JsonObject data;
Expand Down Expand Up @@ -100,64 +104,55 @@ public List<Image> getUploaderAvatars() {
}

@Override
public List<AudioStream> getAudioStreams() throws ExtractionException {
final JsonArray recordings = data.getArray("recordings");
public List<AudioStream> getAudioStreams() {
final List<MediaCCCRecording.Audio> recordings = getRecordings().stream()
.flatMap(r ->
r instanceof MediaCCCRecording.Audio
? Stream.of((MediaCCCRecording.Audio) r)
: Stream.empty()
)
.collect(Collectors.toList());
final List<AudioStream> audioStreams = new ArrayList<>();
for (int i = 0; i < recordings.size(); i++) {
final JsonObject recording = recordings.getObject(i);
final String mimeType = recording.getString("mime_type");
if (mimeType.startsWith("audio")) {
// First we need to resolve the actual video data from the CDN
final MediaFormat mediaFormat;
if (mimeType.endsWith("opus")) {
mediaFormat = MediaFormat.OPUS;
} else if (mimeType.endsWith("mpeg")) {
mediaFormat = MediaFormat.MP3;
} else if (mimeType.endsWith("ogg")) {
mediaFormat = MediaFormat.OGG;
} else {
mediaFormat = null;
}

final AudioStream.Builder builder = new AudioStream.Builder()
.setId(recording.getString("filename", ID_UNKNOWN))
.setContent(recording.getString("recording_url"), true)
.setMediaFormat(mediaFormat)
.setAverageBitrate(UNKNOWN_BITRATE);

final String language = recording.getString("language");
// If the language contains a - symbol, this means that the stream has an audio
// track with multiple languages, so there is no specific language for this stream
// Don't set the audio language in this case
if (language != null && !language.contains("-")) {
builder.setAudioLocale(LocaleCompat.forLanguageTag(language).orElseThrow(() ->
new ParsingException(
"Cannot convert this language to a locale: " + language)
));
}

// Not checking containsSimilarStream here, since MediaCCC does not provide enough
// information to decide whether two streams are similar. Hence that method would
// always return false, e.g. even for different language variations.
audioStreams.add(builder.build());
for (final MediaCCCRecording.Audio recording : recordings) {
// First we need to resolve the actual video data from the CDN
final MediaFormat mediaFormat;
if (recording.mimeType.endsWith("opus")) {
mediaFormat = MediaFormat.OPUS;
} else if (recording.mimeType.endsWith("mpeg")) {
mediaFormat = MediaFormat.MP3;
} else if (recording.mimeType.endsWith("ogg")) {
mediaFormat = MediaFormat.OGG;
} else {
mediaFormat = null;
}
audioStreams.add(new AudioStream.Builder()
.setId(recording.filename)
.setContent(recording.url, true)
.setMediaFormat(mediaFormat)
.setAverageBitrate(UNKNOWN_BITRATE)
.setAudioLocale(recording.language)
.build());
}
return audioStreams;
}

@Override
public List<VideoStream> getVideoStreams() throws ExtractionException {
final JsonArray recordings = data.getArray("recordings");

final List<MediaCCCRecording.Video> recordings = getRecordings().stream()
.flatMap(r ->
r instanceof MediaCCCRecording.Video
? Stream.of((MediaCCCRecording.Video) r)
: Stream.empty()
)
.collect(Collectors.toList());
final List<VideoStream> videoStreams = new ArrayList<>();
for (int i = 0; i < recordings.size(); i++) {
final JsonObject recording = recordings.getObject(i);
final String mimeType = recording.getString("mime_type");
if (mimeType.startsWith("video")) {
for (final MediaCCCRecording.Video recording : recordings) {
// First we need to resolve the actual video data from the CDN
final MediaFormat mediaFormat;
if (mimeType.endsWith("webm")) {
if (recording.mimeType.endsWith("webm")) {
mediaFormat = MediaFormat.WEBM;
} else if (mimeType.endsWith("mp4")) {
} else if (recording.mimeType.endsWith("mp4")) {
mediaFormat = MediaFormat.MPEG_4;
} else {
mediaFormat = null;
Expand All @@ -167,18 +162,119 @@ public List<VideoStream> getVideoStreams() throws ExtractionException {
// information to decide whether two streams are similar. Hence that method would
// always return false, e.g. even for different language variations.
videoStreams.add(new VideoStream.Builder()
.setId(recording.getString("filename", ID_UNKNOWN))
.setContent(recording.getString("recording_url"), true)
.setId(recording.filename)
.setContent(recording.url, true)
.setIsVideoOnly(false)
.setMediaFormat(mediaFormat)
.setResolution(recording.getInt("height") + "p")
.setResolution(recording.height + "p")
.build());
}
}

return videoStreams;
}

/**
 * Parse the {@code recordings} array of the event JSON into typed
 * {@link MediaCCCRecording} objects.
 *
 * <p>Each entry is classified in this order:</p>
 * <ol>
 *   <li>mime type starting with {@code video/}: {@link MediaCCCRecording.Video}</li>
 *   <li>mime type starting with {@code audio/}: {@link MediaCCCRecording.Audio}</li>
 *   <li>mime type equal to {@code application/x-subrip}:
 *       {@link MediaCCCRecording.Subtitle}</li>
 *   <li>mime type starting with {@code application/} and a folder containing
 *       {@code slides}: {@link MediaCCCRecording.Slides}</li>
 *   <li>anything else: {@link MediaCCCRecording.Unknown}</li>
 * </ol>
 *
 * <p>Missing {@code mime_type}, {@code folder} or {@code language} fields do not abort the
 * parsing: such an entry simply ends up with no language and/or as {@code Unknown}.</p>
 *
 * @return one element per entry of the {@code recordings} array, in the same order
 */
public List<MediaCCCRecording> getRecordings() {
    final JsonArray recordingsArray = data.getArray("recordings");
    final List<MediaCCCRecording> recordings = new ArrayList<>();
    for (int i = 0; i < recordingsArray.size(); i++) {
        final JsonObject recording = recordingsArray.getObject(i);
        final String filename = recording.getString("filename", ID_UNKNOWN);
        final String url = recording.getString("recording_url");
        final String languages = recording.getString("language");
        final String rawMimeType = recording.getString("mime_type");
        // Never call String methods on a possibly missing JSON field
        final String mimeType = rawMimeType == null ? "" : rawMimeType;
        final String folder = recording.getString("folder", "");
        // Language of single-language recordings; stays null if absent or not recognized
        final Locale firstLanguage = languages == null
                ? null
                : mediaCCCLanguageTagToLocale(languages);

        if (mimeType.startsWith("video/")) {
            final MediaCCCRecording.Video v = new MediaCCCRecording.Video();
            v.filename = filename;
            // they will put the slides videos into the "slides" folder
            v.recordingType = folder.contains("slides")
                    ? MediaCCCRecording.VideoType.SLIDES
                    : MediaCCCRecording.VideoType.MAIN;
            v.mimeType = mimeType;
            // A video carries one audio track per "-"-separated language tag;
            // tags we cannot translate are dropped
            v.languages = Arrays.stream(languages == null
                            ? new String[0]
                            : languages.split("-"))
                    .map(MediaCCCStreamExtractor::mediaCCCLanguageTagToLocale)
                    .filter(l -> l != null)
                    .collect(Collectors.toList());
            v.url = url;
            v.lengthSeconds = recording.getInt("length");
            v.width = recording.getInt("width");
            v.height = recording.getInt("height");
            recordings.add(v);
        } else if (mimeType.startsWith("audio/")) {
            final MediaCCCRecording.Audio a = new MediaCCCRecording.Audio();
            a.filename = filename;
            a.mimeType = mimeType;
            a.language = firstLanguage;
            a.url = url;
            a.lengthSeconds = recording.getInt("length");
            recordings.add(a);
        } else if ("application/x-subrip".equals(mimeType)) {
            // Strings must be compared with equals(): "==" only compares references and
            // never matches a string that was parsed from JSON
            final MediaCCCRecording.Subtitle s = new MediaCCCRecording.Subtitle();
            s.filename = filename;
            s.mimeType = mimeType;
            s.language = firstLanguage;
            s.url = url;
            recordings.add(s);
        } else if (mimeType.startsWith("application/") && folder.contains("slides")) {
            final MediaCCCRecording.Slides s = new MediaCCCRecording.Slides();
            s.filename = filename;
            s.mimeType = mimeType;
            s.language = firstLanguage;
            s.url = url;
            recordings.add(s);
        } else {
            final MediaCCCRecording.Unknown u = new MediaCCCRecording.Unknown();
            u.filename = filename;
            // Keep the value exactly as delivered (possibly null) for debugging
            u.mimeType = rawMimeType;
            u.url = url;
            u.rawObject = recording;
            recordings.add(u);
        }
    }

    return recordings;
}

/**
 * Translate the media.ccc.de language tag to a Locale.
 *
 * <p>They use the first three letters of the German word for the language.
 * In case there’s still a {@code -} in the string, only the part before the first
 * {@code -} is translated.</p>
 *
 * @param language language tag; {@code null} (e.g. a missing JSON field) is tolerated
 * @return null if the tag is null or we don’t have that language in our switch, or Locale
 */
private static @Nullable Locale mediaCCCLanguageTagToLocale(@Nullable final String language) {
    if (language == null) {
        return null;
    }
    final int idx = language.indexOf('-');
    // TODO: would be cool if we could WARN here, but let’s just continue in case there’s
    //  still a separator
    final String firstTag = idx == -1 ? language : language.substring(0, idx);
    switch (firstTag) {
        case "deu":
            return Locale.GERMAN;
        case "eng":
            return Locale.ENGLISH;
        case "fra":
            return Locale.FRENCH;
        case "ita":
            return Locale.ITALIAN;
        case "spa":
            return Locale.forLanguageTag("es");
        default:
            return null;
    }
}

@Override
public List<VideoStream> getVideoOnlyStreams() {
return Collections.emptyList();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package org.schabi.newpipe.extractor.services.media_ccc.extractors.data;

import com.grack.nanojson.JsonObject;

import java.util.List;
import java.util.Locale;

import javax.annotation.Nullable;


/** A recording stream of a talk/event. Switch on the implementation to get the actual data.
 * The implementations are plain data holders with public fields and no behavior of their own;
 * in this change they are filled by {@code MediaCCCStreamExtractor#getRecordings()}. */
public interface MediaCCCRecording {

/** A recording stream of a talk/event.
* These files usually have one or more audio streams in different languages. */
class Video implements MediaCCCRecording {
/** File name of the recording (JSON field {@code filename}). */
public String filename;
/** Whether this is the main recording or a slides recording. */
public VideoType recordingType;
/** Mime type of the file (JSON field {@code mime_type}); begins with {@code video/}. */
public String mimeType;
/** Each language is one separate audio track on the video. */
public List<Locale> languages;
/** Location of the file (JSON field {@code recording_url}). */
public String url;
/** JSON field {@code length}; presumably the duration in seconds, going by the name. */
public int lengthSeconds;
/** JSON field {@code width}; presumably in pixels (TODO confirm). */
public int width;
/** JSON field {@code height}; the extractor renders it as the resolution label
 * {@code <height>p}. */
public int height;
}

/** Some talks have multiple kinds of video. */
enum VideoType {
/** The main recording of a talk/event. */
MAIN,
/** A side-recording of a talk/event that has the slides full-screen.
* Usually if there is a slide-recording there is a MAIN recording as well */
SLIDES
}

/** An audio recording of a talk/event.
* These audio streams are usually also available in their respective video streams.
*/
class Audio implements MediaCCCRecording {
/** File name of the recording (JSON field {@code filename}). */
public String filename;
/** Mime type of the file (JSON field {@code mime_type}); begins with {@code audio/}. */
public String mimeType;
/** Language of the audio, or {@code null} if the language tag could not be translated. */
public @Nullable Locale language;
/** Location of the file (JSON field {@code recording_url}). */
public String url;
/** JSON field {@code length}; presumably the duration in seconds, going by the name. */
public int lengthSeconds;
}

/** A subtitle file of a talk/event. */
class Subtitle implements MediaCCCRecording {
/** File name of the subtitle file (JSON field {@code filename}). */
public String filename;
/** Mime type of the file (JSON field {@code mime_type}). */
public String mimeType;
/** Language of the subtitles, or {@code null} if the language tag could not be translated. */
public @Nullable Locale language;
/** Location of the file (JSON field {@code recording_url}). */
public String url;
}

/** The Slides of the talk, usually as PDF file. */
class Slides implements MediaCCCRecording {
/** File name of the slides file (JSON field {@code filename}). */
public String filename;
/** Mime type of the file (JSON field {@code mime_type}); begins with {@code application/}. */
public String mimeType;
/** Location of the file (JSON field {@code recording_url}). */
public String url;
/** Language of the slides, or {@code null} if the language tag could not be translated. */
public @Nullable Locale language;
}

/** Anything we can’t put in any of the other categories. */
class Unknown implements MediaCCCRecording {
/** File name of the recording (JSON field {@code filename}). */
public String filename;
/** Mime type of the file (JSON field {@code mime_type}); matched none of the other kinds. */
public String mimeType;
/** Location of the file (JSON field {@code recording_url}). */
public String url;
/** The raw object for easier debugging. */
public JsonObject rawObject;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,11 @@ public static String followGoogleRedirectIfNeeded(final String url) {
return url;
}

/**
 * Tell whether a string carries no content, i.e. it is {@code null} or has length zero.
 *
 * @param str the string to inspect, may be {@code null}
 * @return {@code true} if {@code str} is {@code null} or empty, {@code false} otherwise
 */
public static boolean isNullOrEmpty(final String str) {
    if (str == null) {
        return true;
    }
    return str.length() == 0;
}
Expand Down

0 comments on commit 4c2ff3c

Please sign in to comment.