From 148a136bd21d23aefe46a4898457abef5db855f0 Mon Sep 17 00:00:00 2001 From: flashwave Date: Mon, 28 Oct 2024 18:32:02 +0000 Subject: [PATCH] Added ffprobe stuff back, also Link header support for OEmbed. --- uiharu.ts | 295 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 258 insertions(+), 37 deletions(-) diff --git a/uiharu.ts b/uiharu.ts index 46f06a6..d0a1940 100644 --- a/uiharu.ts +++ b/uiharu.ts @@ -9,8 +9,9 @@ import { parseMediaType } from "jsr:@std/media-types"; import { Color } from "https://deno.land/x/color@v0.3.0/mod.ts"; // todo: these should not be hardcoded lol +const hostName: String = 'uiharu.edgii.net'; const port: Number = 3009; -const memcacheServer: String = "127.0.0.1:11211"; +const memcacheServer: String = '127.0.0.1:11211'; const allowedOrigins: String[] = [ 'edgii.net', 'chat.edgii.net', @@ -78,12 +79,22 @@ const allowOEmbed: String[] = [ // copied from wordpress source sorta '.pinterest.com.au', '.pinterest.com.mx', '.wolframcloud.com', + '.instagram.com', + '.facebook.com', '.pca.st', '.anghami.com', '.bsky.app', '.apple.com', + '.flashii.net', + '.fii.moe', + '.tako.zone', + '.patchii.net', + '.railgun.sh', + '.flash.moe', + '.edgii.net', ]; +const appVersion: String = '20241028'; const isDebug: Boolean = existsSync(pathJoin(import.meta.dirname, '.debug')); const cache: MemcacheClient = new MemcacheClient({ server: memcacheServer, @@ -94,6 +105,21 @@ const cache: MemcacheClient = new MemcacheClient({ }, }); +const uiharuFetch = async (url, init) => { + if(!init) + init = {}; + if(!init.headers) + init.headers = {}; + if(!init.headers['Accept']) + init.headers['Accept'] = 'text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'; + if(!init.headers['Accept-Language']) + init.headers['Accept-Language'] = 'en-GB, en;q=0.9, ja-jp;q=0.6, *;q=0.5'; + if(!init.headers['User-Agent']) + init.headers['User-Agent'] = `Mozilla/5.0 (compatible; Uiharu/${appVersion}; +http://fii.moe/uiharu)`; + + return await fetch(url, init); +}; + const readableStreamToString = async (stream?: ReadableStream): string => { if(stream === null) return ''; @@ -313,31 +339,69 @@ const extractLinkedData = (html) => { return values; }; -const extractOEmbedData = async (html, url: string, urlInfo: URL) => { - // TODO: this should also support header discovery +const parseLinkHeader = (header: string) => { + const links = []; + const lines = header.split(','); + for(const key in lines) { + const parts = lines[key].trim().split(';').map(part => part.trim()); + + let href = parts.shift(); + if(typeof href !== 'string' || !href.startsWith('<') || !href.endsWith('>')) + continue; + + href = decodeURI(href.slice(1, -1)); + const link = {}; + links.push(link); + + for(const part of parts) { + const attr = part.split('=', 2); + let value = attr[1]; + if(value.startsWith('"') && value.endsWith('"')) + value = value.slice(1, -1); + + link[attr[0]] = value; + } + + // applying this last to avoid tomfoolery :3 + link.href = href; + } + + return links; +}; + +const extractOEmbedData = async (response: Response, html, url: string, urlInfo: URL) => { let oEmbedUrl: string = ''; // idk how long i'll bother with this for if(urlInfo.host === 'x.com' || urlInfo.host === 'twitter.com') oEmbedUrl = `https://publish.twitter.com/oembed?dnt=true&omit_script=true&url=${encodeURIComponent(url)}`; - else + else if(html !== undefined) oEmbedUrl = html('link[rel="alternate"][type="application/json+oembed"]').first()?.attr('href')?.trim() ?? ''; + if(oEmbedUrl === '') { + const links = parseLinkHeader(response.headers.get('link') ?? ''); + for(const link of links) + if(link.rel === 'alternate' && link.type === 'application/json+oembed') { + oEmbedUrl = link.href; + break; + } + } + if(oEmbedUrl === '') return {}; try { - return (await fetch(oEmbedUrl)).json(); + return (await uiharuFetch(oEmbedUrl)).json(); } catch(ex) { console.error(ex); return {}; } }; -const extractMetadata = async (url: string, urlInfo: URL) => { - const data = await fetch(url); - const contentTypeRaw = data.headers.get('content-type') ?? ''; +const extractMetadata = async (version: number, url: string, urlInfo: URL) => { + const response = await uiharuFetch(url); + const contentTypeRaw = response.headers.get('content-type') ?? ''; const contentType = parseMediaType(contentTypeRaw); const info = {}; @@ -353,8 +417,10 @@ const extractMetadata = async (url: string, urlInfo: URL) => { if(contentType[0]) info.media_type = contentType[0]; + let html = undefined; + if(['text/html', 'application/xhtml+xml'].includes(contentType[0])) { - const html = cheerio.load(await readableStreamToString(data.body)); + html = cheerio.load(await readableStreamToString(response.body)); const metaData = extractHtmlMetaData(html); const ogData = extractOpenGraphData(html); @@ -368,7 +434,7 @@ const extractMetadata = async (url: string, urlInfo: URL) => { if(ogData.images?.length > 0) { const image = ogData.images[0]; - info.image = info.image_url = image.secure_url ?? image.url; + info.image_url = image.secure_url ?? image.url; if(image.width > 0) info.image_width = image.width; if(image.height > 0) @@ -377,15 +443,9 @@ const extractMetadata = async (url: string, urlInfo: URL) => { info.image_type = image.type; if(image.alt) info.image_alt = image.alt; - if(info.image_width > 0) - info.width = info.image_width; - if(info.image_height > 0) - info.height = info.image_height; } else { addInfoOrDont('image_url', twitterData.image ?? metaData.image ?? metaData.thumbnail); addInfoOrDont('image_alt', twitterData.image_alt); - if(info.image_url) - info.image = info.image_url; } if(ogData.audios?.length > 0) { @@ -406,36 +466,191 @@ const extractMetadata = async (url: string, urlInfo: URL) => { info.video_type = video.type; if(video.tags?.length > 0) info.video_tags = video.tags; - if(info.video_width > 0) - info.width = info.video_width; - if(info.video_height > 0) - info.height = info.video_height; } else { addInfoOrDont('video_url', twitterData.player); addInfoOrDont('video_width', twitterData.player_width); addInfoOrDont('video_height', twitterData.player_height); + } + + if(version < 2) { + info.image = info.image_url; if(info.video_width > 0) info.width = info.video_width; + else if(info.image_width > 0) + info.width = info.image_width; if(info.video_height > 0) info.height = info.video_height; + else if(info.image_height > 0) + info.height = info.image_height; } const linkedDatas = extractLinkedData(html); - if(linkedDatas.length > 0) - info.lds = linkedDatas; - - if(isAllowedOEmbedDomain(urlInfo.host)) { - const oEmbedData = await extractOEmbedData(html, url, urlInfo); - if(oEmbedData.version) - info.oembed = oEmbedData; - } + // idk what to do with this yet, only including this in debug mode for now + if(isDebug && linkedDatas.length > 0) + info._lds = linkedDatas; } else { - if(contentType[0].startsWith('image/')) { - // - } else if(contentType[0].startsWith('video/')) { - // - } else if(contentType[0].startsWith('audio/')) { - // + const isAudio = contentType[0].startsWith('audio/'); + const isImage = contentType[0].startsWith('image/'); + const isVideo = contentType[0].startsWith('video/'); + + if(isAudio || isImage || isVideo) { + // this still seems like a terrible idea lol + const { code, stdout, stderr } = await (new Deno.Command('ffprobe', { + stdin: 'null', + stdout: 'piped', + stderr: 'piped', + args: [ + '-show_streams', + '-show_format', + '-print_format', 'json', + '-v', 'quiet', + '-i', url + ], + })).output(); + + if(code !== 0) { + console.error(new TextDecoder().decode(stderr)); + } else { + const probe = JSON.parse(new TextDecoder().decode(stdout).trim()); + if(isDebug) + info._ffprobe = probe; + + if(typeof probe?.format === 'object') { + const media = {}; + info.media = media; + media.confidence = Math.min(1, Math.max(0, probe.format.probe_score / 100.0)); + + const pfDuration = parseFloat(probe.format.duration); + if(!isNaN(pfDuration)) + media.duration = pfDuration; + + const pfSize = parseInt(probe.format.size); + if(!isNaN(pfSize)) + media.size = pfSize; + + const pfBitRate = parseInt(probe.format.bit_rate); + if(!isNaN(pfBitRate)) { + if(version < 2) + media.bitRate = pfBitRate; + else + media.bitrate = pfBitRate; + } + + // in Title case cus JS doesnt have an accessible lcfirst equivalent :p + const pftFields = ['Title', 'Artist', 'Album', 'Date', 'Comment', 'Genre']; + + if(Array.isArray(probe.streams)) + for(const stream of probe.streams) + if(stream.codec_type === 'video') { + media.width = stream.coded_width ?? stream.width ?? 0; + media.height = stream.coded_height ?? stream.height ?? 0; + + if(typeof stream.display_aspect_ratio === 'string') { + if(version < 2) + media.aspectRatio = stream.display_aspect_ratio; + else + media.aspect_ratio = stream.display_aspect_ratio; + } + } else if(stream.codec_type === 'audio') { + if(typeof stream.tags === 'object') + for(const pftFieldName of pftFields) { + const pftFieldValue = stream.tags[pftFieldName] + ?? probe.format.tags[pftFieldName.toLowerCase()] + ?? probe.format.tags[pftFieldName.toUpperCase()]; + + if(typeof pftFieldValue === 'string') { + if(typeof media.tags !== 'object') + media.tags = {}; + + media.tags[pftFieldName.toLowerCase()] = pftFieldValue; + } + } + } + + if(typeof probe.format.tags === 'object') + for(const pftFieldName of pftFields) { + const pftFieldValue = probe.format.tags[pftFieldName] + ?? probe.format.tags[pftFieldName.toLowerCase()] + ?? probe.format.tags[pftFieldName.toUpperCase()]; + + if(typeof pftFieldValue === 'string') { + if(typeof media.tags !== 'object') + media.tags = {}; + + media.tags[pftFieldName.toLowerCase()] = pftFieldValue; + } + } + } + } + + if(isAudio) { + info.audio_url = url; + info.image_url = `${version < 2 ? '' : 'https:'}//${hostName}/metadata/thumb/audio?url=${encodeURIComponent(url)}`; + info.image_type = 'image/png'; + + let title = ''; + if(typeof info.media.tags.artist === 'string') + title += `${info.media.tags.artist} - `; + if(typeof info.media.tags.title === 'string') + title += info.media.tags.title; + if(typeof info.media.tags.date === 'string') + title += ` (${info.media.tags.date})`; + title = title.trim(); + if(title !== '') + info.title = title; + + if(typeof info.media.tags.comment === 'string') + info.description = info.media.tags.comment.trim(); + } else if(isImage) { + info.image_url = url; + info.image_type = info.media_type; + + if(info.media.width > 0) + info.width = info.image_width = info.media.width; + if(info.media.height > 0) + info.height = info.image_height = info.media.height; + } else if(isVideo) { + info.video_url = url; + info.image_url = `${version < 2 ? '' : 'https:'}//${hostName}/metadata/thumb/video?url=${encodeURIComponent(url)}`; + info.image_type = 'image/png'; + + if(info.media.width > 0) + info.image_width = info.width = info.video_width = info.media.width; + if(info.media.height > 0) + info.image_height = info.height = info.video_height = info.media.height; + } + + if(version < 2) { + info.image = info.image_url; + + if(isAudio) + info.is_audio = true; + else if(isImage) + info.is_image = true; + else if(isVideo) + info.is_video = true; + } + } + } + + if(isAllowedOEmbedDomain(urlInfo.host)) { + const oEmbedData = await extractOEmbedData(response, html, url, urlInfo); + if(oEmbedData.version) + info.oembed = oEmbedData; + } + + if(version < 2 && info.video_url) { + if(info.video_url.startsWith('https://www.youtube.com/')) { + const ytVidUrl = new URL(info.video_url); + const ytVidUrlParams = new URLSearchParams(ytVidUrl.search); + info.type = 'youtube:video'; + info.youtube_video_id = basename(ytVidUrl.pathname); + if(ytVidUrlParams.has('list')) + info.youtube_playlist = ytVidUrlParams.get('list'); + } else if(info.video_url.startsWith('https://embed.nicovideo.jp/')) { + const nndVidUrl = new URL(info.video_url); + info.type = 'niconico:video'; + info.nicovideo_video_id = basename(nndVidUrl.pathname); } } @@ -470,6 +685,7 @@ const requestHandler = async (req: Request): Response => { return new Response('', { status: 405, headers }); const started = performance.now(); + const urlParams = new URLSearchParams(url.search); headers['Content-Type'] = 'application/json;charset=utf-8'; @@ -477,7 +693,7 @@ const requestHandler = async (req: Request): Response => { if(req.method === 'POST') urlParamRaw = (await readableStreamToString(req.body)).trim(); else - urlParamRaw = (new URLSearchParams(url.search)).get('url')?.trim() ?? ''; + urlParamRaw = urlParams.get('url')?.trim() ?? ''; if(urlParamRaw === '') return new Response('{"error":"metadata:uri"}', { status: 400, headers }); @@ -493,10 +709,15 @@ const requestHandler = async (req: Request): Response => { urlParamRaw = urlParam.toString(); + const formatVersion = parseInt(urlParams.get('fv')) || 1; + + if(formatVersion < 1 || formatVersion > 2) + return new Response('{"error":"metadata:version"}', { status: 400, headers }); + const urlHash = encodeBase64Url( await crypto.subtle.digest('SHA-256', new TextEncoder().encode(urlParamRaw)) ); - const cacheKey = `uiharu:metadata:${urlHash}`; + const cacheKey = `uiharu:metadata:fv${formatVersion}:${urlHash}`; // const cacheInfo = await cache.get(cacheKey); // if(cacheInfo !== undefined) // return new Response( @@ -515,7 +736,7 @@ const requestHandler = async (req: Request): Response => { try { const json = JSON.stringify( - await extractMetadata(urlParamRaw, urlParam) + await extractMetadata(formatVersion, urlParamRaw, urlParam) ); cache.set(cacheKey, brotliCompressSync(json), {