Added ffprobe stuff back, also Link header support for OEmbed.

This commit is contained in:
flash 2024-10-28 18:32:02 +00:00
parent e0fa91d049
commit 148a136bd2

295
uiharu.ts
View file

@ -9,8 +9,9 @@ import { parseMediaType } from "jsr:@std/media-types";
import { Color } from "https://deno.land/x/color@v0.3.0/mod.ts";
// todo: these should not be hardcoded lol
// Public host name of this service; used when building absolute thumbnail URLs
// (`//${hostName}/metadata/thumb/...`) that point back at it.
const hostName: string = 'uiharu.edgii.net';
// Port number for the service.
// NOTE(review): presumably the port the HTTP listener binds to; the serve call is not visible here, confirm.
const port: number = 3009;
const memcacheServer: String = "127.0.0.1:11211";
const memcacheServer: String = '127.0.0.1:11211';
const allowedOrigins: String[] = [
'edgii.net',
'chat.edgii.net',
@ -78,12 +79,22 @@ const allowOEmbed: String[] = [ // copied from wordpress source sorta
'.pinterest.com.au',
'.pinterest.com.mx',
'.wolframcloud.com',
'.instagram.com',
'.facebook.com',
'.pca.st',
'.anghami.com',
'.bsky.app',
'.apple.com',
'.flashii.net',
'.fii.moe',
'.tako.zone',
'.patchii.net',
'.railgun.sh',
'.flash.moe',
'.edgii.net',
];
// Version stamp of the application; interpolated into the User-Agent header built by uiharuFetch.
const appVersion: string = '20241028';
// Debug mode is enabled when a file named `.debug` exists in the directory of this module.
// It gates the extra diagnostic fields (`_lds`, `_ffprobe`) added to the metadata output.
const isDebug: boolean = existsSync(pathJoin(import.meta.dirname, '.debug'));
const cache: MemcacheClient = new MemcacheClient({
server: memcacheServer,
@ -94,6 +105,21 @@ const cache: MemcacheClient = new MemcacheClient({
},
});
/**
 * Wrapper around `fetch` that fills in default `Accept`, `Accept-Language` and
 * `User-Agent` request headers when the caller did not supply (non-empty) ones.
 *
 * The caller's `init` object is never modified: headers are copied into a fresh
 * `Headers` instance, which also means `init.headers` may be a plain object, an
 * array of `[name, value]` pairs or a `Headers` instance, and that header names
 * are matched case-insensitively.
 *
 * @param url Resource to request; passed to `fetch` unchanged.
 * @param init Optional request options; every field except `headers` is forwarded as-is.
 * @returns The `Response` produced by `fetch`.
 */
const uiharuFetch = async (url: string | URL | Request, init?: RequestInit): Promise<Response> => {
    const headers = new Headers(init?.headers);

    const defaults: [string, string][] = [
        ['Accept', 'text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'],
        ['Accept-Language', 'en-GB, en;q=0.9, ja-jp;q=0.6, *;q=0.5'],
        ['User-Agent', `Mozilla/5.0 (compatible; Uiharu/${appVersion}; +http://fii.moe/uiharu)`],
    ];

    for(const [name, value] of defaults)
        // get() yields null when absent and '' when empty; both count as "not set"
        if(!headers.get(name))
            headers.set(name, value);

    return await fetch(url, { ...init, headers });
};
const readableStreamToString = async (stream?: ReadableStream): string => {
if(stream === null)
return '';
@ -313,31 +339,69 @@ const extractLinkedData = (html) => {
return values;
};
const extractOEmbedData = async (html, url: string, urlInfo: URL) => {
// TODO: this should also support header discovery
/**
 * Parses the value of an HTTP `Link` header into a list of link objects.
 *
 * Every entry of the form `<target>; name=value; name="value"` becomes one
 * object whose keys are the lower-cased parameter names and whose `href` is
 * the (URI-decoded) target. Entries that do not start with a `<...>` target
 * are skipped. Parameters without a value (e.g. `; crossorigin`) are stored
 * as an empty string.
 *
 * @param header Raw header value; an empty string yields an empty list.
 * @returns One object per well-formed link, in header order.
 */
const parseLinkHeader = (header: string): Record<string, string>[] => {
    // Splits on `separator`, ignoring separators inside a leading <...> target
    // or inside a double-quoted string, so URLs and titles may contain , and ;
    const splitTopLevel = (input: string, separator: string): string[] => {
        const pieces: string[] = [];
        let current = '';
        let inQuotes = false;
        let inAngles = false;

        for(let i = 0; i < input.length; ++i) {
            const char = input[i];

            if(inQuotes) {
                current += char;
                // a backslash escapes the next character inside a quoted string
                if(char === '\\' && i + 1 < input.length)
                    current += input[++i];
                else if(char === '"')
                    inQuotes = false;
                continue;
            }

            if(inAngles) {
                current += char;
                if(char === '>')
                    inAngles = false;
                continue;
            }

            if(char === separator) {
                pieces.push(current);
                current = '';
                continue;
            }

            if(char === '"')
                inQuotes = true;
            else if(char === '<' && current.trim() === '')
                // only a '<' that opens the piece starts a target
                inAngles = true;

            current += char;
        }

        pieces.push(current);
        return pieces;
    };

    const links: Record<string, string>[] = [];

    for(const entry of splitTopLevel(header, ',')) {
        const parts = splitTopLevel(entry, ';').map(part => part.trim());

        const target = parts.shift();
        if(target === undefined || !target.startsWith('<') || !target.endsWith('>'))
            continue;

        let href = target.slice(1, -1);
        try {
            href = decodeURI(href);
        } catch {
            // malformed percent-escape: keep the target exactly as it was sent
        }

        const link: Record<string, string> = {};
        links.push(link);

        for(const part of parts) {
            if(part === '')
                continue;

            // split on the FIRST '=' only so values may themselves contain '='
            const eq = part.indexOf('=');
            const name = (eq < 0 ? part : part.slice(0, eq)).trim().toLowerCase();
            if(name === '')
                continue;

            let value = eq < 0 ? '' : part.slice(eq + 1).trim();
            if(value.startsWith('"') && value.endsWith('"'))
                value = value.slice(1, -1);

            link[name] = value;
        }

        // applied last so a parameter named "href" can never replace the real target
        link.href = href;
    }

    return links;
};
// Locates the oEmbed endpoint for a page and downloads its JSON document.
//
// Endpoint selection, in order:
//  1. x.com / twitter.com hosts: a fixed publish.twitter.com oEmbed URL built from `url`.
//  2. otherwise, when `html` was provided: the href of the first
//     <link rel="alternate" type="application/json+oembed"> element.
//  3. when still empty: the first entry in the response's `Link` header whose
//     rel is "alternate" and type is "application/json+oembed".
//
// response: the Response of the page request; only its `link` header is read here.
// html:     called as html(selector), i.e. the function returned by cheerio.load; may be undefined.
// url:      the page URL as a string (only used for the twitter endpoint).
// urlInfo:  parsed form of the page URL; only `host` is read.
//
// Returns the parsed JSON, or {} when no endpoint was found or the request threw.
// NOTE(review): this listing comes from a diff view; the bare `else` and the first
// `return (await fetch(...))` below look like the pre-change lines shown next to
// their replacements — confirm against the real file.
const extractOEmbedData = async (response: Response, html, url: string, urlInfo: URL) => {
let oEmbedUrl: string = '';
// idk how long i'll bother with this for
if(urlInfo.host === 'x.com' || urlInfo.host === 'twitter.com')
oEmbedUrl = `https://publish.twitter.com/oembed?dnt=true&omit_script=true&url=${encodeURIComponent(url)}`;
else
else if(html !== undefined)
oEmbedUrl = html('link[rel="alternate"][type="application/json+oembed"]').first()?.attr('href')?.trim() ?? '';
// fall back to header discovery when the markup gave nothing
if(oEmbedUrl === '') {
const links = parseLinkHeader(response.headers.get('link') ?? '');
for(const link of links)
if(link.rel === 'alternate' && link.type === 'application/json+oembed') {
oEmbedUrl = link.href;
break;
}
}
if(oEmbedUrl === '')
return {};
try {
// NOTE(review): the promise from .json() is returned without `await`, so a rejection
// from body parsing is not handled by the catch below and reaches the caller instead.
return (await fetch(oEmbedUrl)).json();
return (await uiharuFetch(oEmbedUrl)).json();
} catch(ex) {
console.error(ex);
return {};
}
};
const extractMetadata = async (url: string, urlInfo: URL) => {
const data = await fetch(url);
const contentTypeRaw = data.headers.get('content-type') ?? '';
const extractMetadata = async (version: number, url: string, urlInfo: URL) => {
const response = await uiharuFetch(url);
const contentTypeRaw = response.headers.get('content-type') ?? '';
const contentType = parseMediaType(contentTypeRaw);
const info = {};
@ -353,8 +417,10 @@ const extractMetadata = async (url: string, urlInfo: URL) => {
if(contentType[0])
info.media_type = contentType[0];
let html = undefined;
if(['text/html', 'application/xhtml+xml'].includes(contentType[0])) {
const html = cheerio.load(await readableStreamToString(data.body));
html = cheerio.load(await readableStreamToString(response.body));
const metaData = extractHtmlMetaData(html);
const ogData = extractOpenGraphData(html);
@ -368,7 +434,7 @@ const extractMetadata = async (url: string, urlInfo: URL) => {
if(ogData.images?.length > 0) {
const image = ogData.images[0];
info.image = info.image_url = image.secure_url ?? image.url;
info.image_url = image.secure_url ?? image.url;
if(image.width > 0)
info.image_width = image.width;
if(image.height > 0)
@ -377,15 +443,9 @@ const extractMetadata = async (url: string, urlInfo: URL) => {
info.image_type = image.type;
if(image.alt)
info.image_alt = image.alt;
if(info.image_width > 0)
info.width = info.image_width;
if(info.image_height > 0)
info.height = info.image_height;
} else {
addInfoOrDont('image_url', twitterData.image ?? metaData.image ?? metaData.thumbnail);
addInfoOrDont('image_alt', twitterData.image_alt);
if(info.image_url)
info.image = info.image_url;
}
if(ogData.audios?.length > 0) {
@ -406,36 +466,191 @@ const extractMetadata = async (url: string, urlInfo: URL) => {
info.video_type = video.type;
if(video.tags?.length > 0)
info.video_tags = video.tags;
if(info.video_width > 0)
info.width = info.video_width;
if(info.video_height > 0)
info.height = info.video_height;
} else {
addInfoOrDont('video_url', twitterData.player);
addInfoOrDont('video_width', twitterData.player_width);
addInfoOrDont('video_height', twitterData.player_height);
}
if(version < 2) {
info.image = info.image_url;
if(info.video_width > 0)
info.width = info.video_width;
else if(info.image_width > 0)
info.width = info.image_width;
if(info.video_height > 0)
info.height = info.video_height;
else if(info.image_height > 0)
info.height = info.image_height;
}
const linkedDatas = extractLinkedData(html);
if(linkedDatas.length > 0)
info.lds = linkedDatas;
if(isAllowedOEmbedDomain(urlInfo.host)) {
const oEmbedData = await extractOEmbedData(html, url, urlInfo);
if(oEmbedData.version)
info.oembed = oEmbedData;
}
// idk what to do with this yet, only including this in debug mode for now
if(isDebug && linkedDatas.length > 0)
info._lds = linkedDatas;
} else {
if(contentType[0].startsWith('image/')) {
//
} else if(contentType[0].startsWith('video/')) {
//
} else if(contentType[0].startsWith('audio/')) {
//
const isAudio = contentType[0].startsWith('audio/');
const isImage = contentType[0].startsWith('image/');
const isVideo = contentType[0].startsWith('video/');
if(isAudio || isImage || isVideo) {
// this still seems like a terrible idea lol
const { code, stdout, stderr } = await (new Deno.Command('ffprobe', {
stdin: 'null',
stdout: 'piped',
stderr: 'piped',
args: [
'-show_streams',
'-show_format',
'-print_format', 'json',
'-v', 'quiet',
'-i', url
],
})).output();
if(code !== 0) {
console.error(new TextDecoder().decode(stderr));
} else {
const probe = JSON.parse(new TextDecoder().decode(stdout).trim());
if(isDebug)
info._ffprobe = probe;
if(typeof probe?.format === 'object') {
const media = {};
info.media = media;
media.confidence = Math.min(1, Math.max(0, probe.format.probe_score / 100.0));
const pfDuration = parseFloat(probe.format.duration);
if(!isNaN(pfDuration))
media.duration = pfDuration;
const pfSize = parseInt(probe.format.size);
if(!isNaN(pfSize))
media.size = pfSize;
const pfBitRate = parseInt(probe.format.bit_rate);
if(!isNaN(pfBitRate)) {
if(version < 2)
media.bitRate = pfBitRate;
else
media.bitrate = pfBitRate;
}
// in Title case cus JS doesnt have an accessible lcfirst equivalent :p
const pftFields = ['Title', 'Artist', 'Album', 'Date', 'Comment', 'Genre'];
if(Array.isArray(probe.streams))
for(const stream of probe.streams)
if(stream.codec_type === 'video') {
media.width = stream.coded_width ?? stream.width ?? 0;
media.height = stream.coded_height ?? stream.height ?? 0;
if(typeof stream.display_aspect_ratio === 'string') {
if(version < 2)
media.aspectRatio = stream.display_aspect_ratio;
else
media.aspect_ratio = stream.display_aspect_ratio;
}
} else if(stream.codec_type === 'audio') {
if(typeof stream.tags === 'object')
for(const pftFieldName of pftFields) {
const pftFieldValue = stream.tags[pftFieldName]
?? probe.format.tags[pftFieldName.toLowerCase()]
?? probe.format.tags[pftFieldName.toUpperCase()];
if(typeof pftFieldValue === 'string') {
if(typeof media.tags !== 'object')
media.tags = {};
media.tags[pftFieldName.toLowerCase()] = pftFieldValue;
}
}
}
if(typeof probe.format.tags === 'object')
for(const pftFieldName of pftFields) {
const pftFieldValue = probe.format.tags[pftFieldName]
?? probe.format.tags[pftFieldName.toLowerCase()]
?? probe.format.tags[pftFieldName.toUpperCase()];
if(typeof pftFieldValue === 'string') {
if(typeof media.tags !== 'object')
media.tags = {};
media.tags[pftFieldName.toLowerCase()] = pftFieldValue;
}
}
}
}
if(isAudio) {
info.audio_url = url;
info.image_url = `${version < 2 ? '' : 'https:'}//${hostName}/metadata/thumb/audio?url=${encodeURIComponent(url)}`;
info.image_type = 'image/png';
let title = '';
if(typeof info.media.tags.artist === 'string')
title += `${info.media.tags.artist} - `;
if(typeof info.media.tags.title === 'string')
title += info.media.tags.title;
if(typeof info.media.tags.date === 'string')
title += ` (${info.media.tags.date})`;
title = title.trim();
if(title !== '')
info.title = title;
if(typeof info.media.tags.comment === 'string')
info.description = info.media.tags.comment.trim();
} else if(isImage) {
info.image_url = url;
info.image_type = info.media_type;
if(info.media.width > 0)
info.width = info.image_width = info.media.width;
if(info.media.height > 0)
info.height = info.image_height = info.media.height;
} else if(isVideo) {
info.video_url = url;
info.image_url = `${version < 2 ? '' : 'https:'}//${hostName}/metadata/thumb/video?url=${encodeURIComponent(url)}`;
info.image_type = 'image/png';
if(info.media.width > 0)
info.image_width = info.width = info.video_width = info.media.width;
if(info.media.height > 0)
info.image_height = info.height = info.video_height = info.media.height;
}
if(version < 2) {
info.image = info.image_url;
if(isAudio)
info.is_audio = true;
else if(isImage)
info.is_image = true;
else if(isVideo)
info.is_video = true;
}
}
}
if(isAllowedOEmbedDomain(urlInfo.host)) {
const oEmbedData = await extractOEmbedData(response, html, url, urlInfo);
if(oEmbedData.version)
info.oembed = oEmbedData;
}
if(version < 2 && info.video_url) {
if(info.video_url.startsWith('https://www.youtube.com/')) {
const ytVidUrl = new URL(info.video_url);
const ytVidUrlParams = new URLSearchParams(ytVidUrl.search);
info.type = 'youtube:video';
info.youtube_video_id = basename(ytVidUrl.pathname);
if(ytVidUrlParams.has('list'))
info.youtube_playlist = ytVidUrlParams.get('list');
} else if(info.video_url.startsWith('https://embed.nicovideo.jp/')) {
const nndVidUrl = new URL(info.video_url);
info.type = 'niconico:video';
info.nicovideo_video_id = basename(nndVidUrl.pathname);
}
}
@ -470,6 +685,7 @@ const requestHandler = async (req: Request): Response => {
return new Response('', { status: 405, headers });
const started = performance.now();
const urlParams = new URLSearchParams(url.search);
headers['Content-Type'] = 'application/json;charset=utf-8';
@ -477,7 +693,7 @@ const requestHandler = async (req: Request): Response => {
if(req.method === 'POST')
urlParamRaw = (await readableStreamToString(req.body)).trim();
else
urlParamRaw = (new URLSearchParams(url.search)).get('url')?.trim() ?? '';
urlParamRaw = urlParams.get('url')?.trim() ?? '';
if(urlParamRaw === '')
return new Response('{"error":"metadata:uri"}', { status: 400, headers });
@ -493,10 +709,15 @@ const requestHandler = async (req: Request): Response => {
urlParamRaw = urlParam.toString();
const formatVersion = parseInt(urlParams.get('fv')) || 1;
if(formatVersion < 1 || formatVersion > 2)
return new Response('{"error":"metadata:version"}', { status: 400, headers });
const urlHash = encodeBase64Url(
await crypto.subtle.digest('SHA-256', new TextEncoder().encode(urlParamRaw))
);
const cacheKey = `uiharu:metadata:${urlHash}`;
const cacheKey = `uiharu:metadata:fv${formatVersion}:${urlHash}`;
// const cacheInfo = await cache.get(cacheKey);
// if(cacheInfo !== undefined)
// return new Response(
@ -515,7 +736,7 @@ const requestHandler = async (req: Request): Response => {
try {
const json = JSON.stringify(
await extractMetadata(urlParamRaw, urlParam)
await extractMetadata(formatVersion, urlParamRaw, urlParam)
);
cache.set(cacheKey, brotliCompressSync(json), {