// uiharu/uiharu.ts
// Repository viewer header: 666 lines, 21 KiB, TypeScript.
// Last change: 2024-10-28 03:40:26 +00:00
import * as cheerio from 'npm:cheerio@^1.0.0';
import { MemcacheClient } from 'npm:memcache-client@^1.0.5';
import { existsSync } from "jsr:@std/fs";
import { basename, join as pathJoin } from "jsr:@std/path";
import { normalize as pathNormalize } from "jsr:@std/path/normalize";
import { encodeBase64Url } from "jsr:@std/encoding/base64url";
import { brotliCompressSync, brotliDecompressSync } from "node:zlib";
import { parseMediaType } from "jsr:@std/media-types";
import { Color } from "https://deno.land/x/color@v0.3.0/mod.ts";
// TODO: these should not be hardcoded

/** TCP port handed to `Deno.serve`. */
const port: number = 3009;

/** `host:port` of the memcached server handed to the cache client. */
const memcacheServer: string = "127.0.0.1:11211";

/** Hosts whose `Origin` header is accepted; any other origin is answered with 403. */
const allowedOrigins: string[] = [
    'edgii.net',
    'chat.edgii.net',
    'sockchat.edgii.net',
    'ajaxchat.edgii.net',
];
/**
 * Domain suffixes for which oEmbed discovery is permitted.
 * Every entry starts with a dot so it can be matched with `endsWith` against
 * a dot-prefixed host name (copied, more or less, from the WordPress source).
 */
const allowOEmbed: string[] = [
    '.youtube.com',
    '.youtu.be',
    '.vimeo.com',
    '.dailymotion.com',
    '.dai.ly',
    '.flickr.com',
    '.flic.kr',
    '.smugmug.com',
    '.scribd.com',
    '.wordpress.tv',
    '.crowdsignal.net',
    '.polldaddy.com',
    '.poll.fm',
    '.survey.fm',
    '.twitter.com',
    // x.com is special-cased by the oEmbed extractor, so it has to pass the allowlist too
    '.x.com',
    '.soundcloud.com',
    '.spotify.com',
    '.imgur.com',
    '.issuu.com',
    '.mixcloud.com',
    '.ted.com',
    '.animoto.com',
    '.video214.com',
    '.tumblr.com',
    '.kickstarter.com',
    '.kck.st',
    '.cloudup.com',
    '.reverbnation.com',
    '.videopress.com',
    '.reddit.com',
    '.speakerdeck.com',
    '.screencast.com',
    '.amazon.com',
    '.amazon.com.mx',
    '.amazon.com.br',
    '.amazon.ca',
    '.amazon.co.uk',
    '.amazon.de',
    '.amazon.fr',
    '.amazon.it',
    '.amazon.es',
    '.amazon.in',
    '.amazon.nl',
    '.amazon.ru',
    '.amazon.co.jp',
    '.amazon.com.au',
    '.amazon.cn',
    '.a.co',
    '.amzn.to',
    '.amzn.eu',
    '.amzn.in',
    '.amzn.asia',
    '.z.cn',
    '.somecards.com',
    '.some.ly',
    '.tiktok.com',
    '.pinterest.com',
    '.pinterest.com.au',
    '.pinterest.com.mx',
    '.wolframcloud.com',
    '.pca.st',
    '.anghami.com',
    '.bsky.app',
    '.apple.com',
];
/**
 * True when a `.debug` marker file exists in this module's directory.
 * `import.meta.dirname` can be undefined, in which case debug mode is
 * simply off instead of passing `undefined` into the path join.
 */
const isDebug: boolean = import.meta.dirname !== undefined
    && existsSync(pathJoin(import.meta.dirname, '.debug'));
// Memcached client used by this module.
// Both compressor hooks are pass-through functions that return the buffer
// they are given unchanged.
const cache: MemcacheClient = new MemcacheClient({
    server: memcacheServer,
    compressor: {
        compressSync: (payload) => payload,
        decompressSync: (payload) => payload,
    },
});
/**
 * Drains a byte stream and decodes it with `TextDecoder`'s default encoding (UTF-8).
 *
 * @param stream Stream to read to completion. `null` and `undefined`
 *               (a request or response without a body) both yield `''`.
 * @returns The decoded text.
 */
const readableStreamToString = async (stream?: ReadableStream<Uint8Array> | null): Promise<string> => {
    // `== null` covers both null (body-less Request/Response) and an omitted argument
    if(stream == null)
        return '';

    const reader = stream.getReader();
    const decoder = new TextDecoder();
    let result = '';

    for(;;) {
        const { done, value } = await reader.read();
        if(done) break;
        // stream mode keeps multi-byte sequences that straddle chunk boundaries intact
        result += decoder.decode(value, { stream: true });
    }

    // flush whatever the decoder is still buffering
    result += decoder.decode();

    return result;
};
/**
 * Checks whether a host name falls under one of the `allowOEmbed` suffixes.
 *
 * @param domain Host name, with or without a leading dot.
 * @returns `true` if the dot-prefixed host ends with any allowlisted suffix.
 */
const isAllowedOEmbedDomain = (domain: string): boolean => {
    // the leading dot makes "youtube.com" match ".youtube.com" while "notyoutube.com" does not
    const dotted = domain.startsWith('.') ? domain : `.${domain}`;
    return allowOEmbed.some((suffix) => dotted.endsWith(suffix));
};
/**
 * Collects plain HTML metadata from a loaded document.
 *
 * @param html Selector function for the loaded document (the caller passes
 *             the result of `cheerio.load`).
 * @returns Object with any of `title`, `description`, `thumbnail`,
 *          `theme_color`, `image` and `canonical_url`; empty values are omitted.
 */
const extractHtmlMetaData = (html) => {
    const values: Record<string, string> = {};

    // trimmed attribute of the first element matching the selector, '' if absent
    const firstAttr = (selector: string, attribute: string): string =>
        html(selector).first()?.attr(attribute)?.trim() ?? '';

    const titleTag: string = html('title')?.first()?.text().trim() ?? '';
    if(titleTag.length > 0)
        values.title = titleTag;

    const metaDescriptionTag = firstAttr('meta[name="description"]', 'content');
    if(metaDescriptionTag.length > 0)
        values.description = metaDescriptionTag;

    const metaThumbnailTag = firstAttr('meta[name="thumbnail"]', 'content');
    if(metaThumbnailTag.length > 0)
        values.thumbnail = metaThumbnailTag;

    const metaThemeColorTag = firstAttr('meta[name="theme-color"]', 'content');
    if(metaThemeColorTag.length > 0)
        try {
            values.theme_color = Color.string(metaThemeColorTag).hex();
        } catch(ex) {
            // NOTE(review): Color.string presumably throws on strings it cannot parse —
            // confirm against the color module. The value comes from an arbitrary remote
            // page, so a bad colour is dropped rather than failing the whole extraction.
            console.error(ex);
        }

    const linkImageSrcTag = firstAttr('link[rel="image_src"]', 'href');
    if(linkImageSrcTag.length > 0)
        values.image = linkImageSrcTag;

    const linkCanonicalTag = firstAttr('link[rel="canonical"]', 'href');
    if(linkCanonicalTag.length > 0)
        values.canonical_url = linkCanonicalTag;

    return values;
};
/**
 * Collects Open Graph (`og:*`) meta tags from a loaded document.
 *
 * Scalar properties are stored under their own name (first occurrence wins).
 * Structured properties (`image`, `video`, `audio`) are collected into the
 * arrays `images`, `videos` and `audios`; a new array item is started when
 * the array is empty or the last item already holds the incoming field.
 *
 * @param html Selector function for the loaded document (the caller passes
 *             the result of `cheerio.load`).
 * @returns Object holding the recognised, validated properties.
 */
const extractOpenGraphData = (html) => {
    // deno-lint-ignore no-explicit-any
    const values: Record<string, any> = {};

    // Property table, keyed by the tag name without the "og:" prefix:
    //   type   - how the content is validated ('str', 'url', 'mime', 'int')
    //   alias  - treat the tag as if it were the named property
    //   of     - the structured parent the field belongs to
    //   array  - collect values under this key instead of a single slot
    //   protos - URL schemes accepted for 'url' values (default: https and http)
    const properties: Record<string, {
        type?: string;
        alias?: string;
        of?: string;
        array?: string;
        protos?: string[];
    }> = {
        'url': { type: 'url' },
        'type': { type: 'str' },
        'title': { type: 'str' },
        'locale': { type: 'str' },
        'locale:alternate': { type: 'str', array: 'locales' },
        'description': { type: 'str' },
        'determiner': { type: 'str' },
        'site_name': { type: 'str' },

        'image': { alias: 'image:url', array: 'images' },
        'image:url': { of: 'image', type: 'url' },
        'image:secure_url': { of: 'image', type: 'url', protos: ['https:'] },
        'image:type': { of: 'image', type: 'mime' },
        'image:width': { of: 'image', type: 'int' },
        'image:height': { of: 'image', type: 'int' },
        // was type 'string', which no validation branch accepts, so alt text was always dropped
        'image:alt': { of: 'image', type: 'str' },

        'video': { alias: 'video:url', array: 'videos' },
        'video:url': { of: 'video', type: 'url' },
        'video:secure_url': { of: 'video', type: 'url', protos: ['https:'] },
        'video:type': { of: 'video', type: 'mime' },
        'video:width': { of: 'video', type: 'int' },
        'video:height': { of: 'video', type: 'int' },
        'video:tag': { of: 'video', type: 'str', array: 'tags' },

        'audio': { alias: 'audio:url', array: 'audios' },
        'audio:url': { of: 'audio', type: 'url' },
        'audio:secure_url': { of: 'audio', type: 'url', protos: ['https:'] },
        'audio:type': { of: 'audio', type: 'mime' },
    };

    const tags = html('meta[property^="og:"]');
    for(const tagInfo of tags) {
        const tag = html(tagInfo);

        // drop the "og:" prefix
        let name: string = (tag.attr('property')?.trim() ?? '').substring(3);
        // own-property lookup so names like "constructor" never resolve to prototype members
        if(!Object.prototype.hasOwnProperty.call(properties, name))
            continue;

        const content: string = tag.attr('content')?.trim() ?? '';
        let propInfo = properties[name];
        // deno-lint-ignore no-explicit-any
        let target: any = values;

        if(propInfo.alias) {
            name = propInfo.alias;
            propInfo = properties[name];
        }

        if(propInfo.of) {
            // "image:width" -> "width"
            name = name.substring(propInfo.of.length + 1);

            const objInfo = properties[propInfo.of];
            if(objInfo.array) {
                if(objInfo.array in target)
                    target = target[objInfo.array];
                else
                    target = target[objInfo.array] = [];

                // a repeated field marks the start of the next structured item
                const lastItem = target[target.length - 1];
                if(lastItem === undefined || name in lastItem) {
                    const newItem = {};
                    target.push(newItem);
                    target = newItem;
                } else
                    target = lastItem;
            } else {
                if(!(name in target))
                    target[name] = {};
                target = target[name];
            }
        }

        if(propInfo.array) {
            if(propInfo.array in target)
                target = target[propInfo.array];
            else
                target = target[propInfo.array] = [];
        } else if(name in target)
            continue; // first occurrence of a scalar wins

        let value: string | number | undefined;
        if(propInfo.type === 'int') {
            value = parseInt(content, 10);
        } else if(propInfo.type === 'mime') {
            // world's most naive validation
            value = content.includes('/') ? content : undefined;
        } else if(propInfo.type === 'url') {
            try {
                const protos = propInfo.protos ?? ['https:', 'http:'];
                value = protos.includes(new URL(content).protocol) ? content : undefined;
            } catch(ex) {
                console.error(ex);
                value = undefined;
            }
        } else if(propInfo.type === 'str') {
            value = content;
        } else {
            value = undefined;
        }

        // falsy results (empty string, NaN, 0, undefined) are not stored
        if(value) {
            if(propInfo.array)
                target.push(value);
            else
                target[name] = value;
        }
    }

    return values;
};
/**
 * Reads the `twitter:*` card meta tags of a loaded document.
 *
 * @param html Selector function for the loaded document.
 * @returns Object keyed by card field, with the first ':' of the field name
 *          replaced by '_' (e.g. `site:id` -> `site_id`); empty values are omitted.
 */
const extractTwitterData = (html) => {
    const collected: Record<string, string> = {};

    [
        'card',
        'site',
        'site:id',
        'creator',
        'creator:id',
        'description',
        'title',
        'image',
        'image:alt',
        'player',
        'player:width',
        'player:height',
        'player:stream',
    ].forEach((field) => {
        const content = html(`meta[name="twitter:${field}"]`)?.first()?.attr('content')?.trim() ?? '';
        if(content.length === 0)
            return;

        collected[field.replace(':', '_')] = content;
    });

    return collected;
};
/**
 * Parses every `<script type="application/ld+json">` element of a loaded document.
 *
 * @param html Selector function for the loaded document.
 * @returns The parsed JSON documents in document order; scripts that fail
 *          to parse are logged with `console.error` and skipped.
 */
const extractLinkedData = (html) => {
    const documents = [];

    for(const scriptNode of html('script[type="application/ld+json"]')) {
        let parsed;
        try {
            parsed = JSON.parse(html(scriptNode).text().trim());
        } catch(err) {
            console.error(err);
            continue;
        }

        documents.push(parsed);
    }

    return documents;
};
/**
 * Discovers and fetches the oEmbed document for a page.
 *
 * x.com / twitter.com URLs are sent to Twitter's publish endpoint; every other
 * page is searched for a `<link rel="alternate" type="application/json+oembed">`.
 *
 * @param html    Selector function for the loaded document.
 * @param url     The page URL as a string; also the base for relative discovery links.
 * @param urlInfo The parsed page URL.
 * @returns The parsed oEmbed JSON, or `{}` when nothing was discovered, the
 *          discovered URL is not http(s), or fetching/parsing failed.
 */
const extractOEmbedData = async (html, url: string, urlInfo: URL) => {
    // TODO: this should also support header discovery
    let oEmbedUrl: string = '';

    // idk how long i'll bother with this for
    if(urlInfo.host === 'x.com' || urlInfo.host === 'twitter.com')
        oEmbedUrl = `https://publish.twitter.com/oembed?dnt=true&omit_script=true&url=${encodeURIComponent(url)}`;
    else
        oEmbedUrl = html('link[rel="alternate"][type="application/json+oembed"]').first()?.attr('href')?.trim() ?? '';

    if(oEmbedUrl === '')
        return {};

    try {
        // the href comes from the remote page: resolve relative links against the
        // page URL and refuse anything that is not plain http(s)
        const target = new URL(oEmbedUrl, url);
        if(target.protocol !== 'https:' && target.protocol !== 'http:')
            return {};

        const response = await fetch(target);
        // awaited inside the try so a JSON parse rejection reaches the catch below
        return await response.json();
    } catch(ex) {
        console.error(ex);
        return {};
    }
};
/**
 * Fetches a URL and builds a flat metadata record for it.
 *
 * Every record starts with `url`, `title` (last path segment) and `site_name`
 * (host), plus `media_type` when the response declares one. For HTML/XHTML
 * responses the record is enriched from Open Graph, Twitter card, plain HTML
 * meta tags, JSON-LD and, for allowlisted hosts, oEmbed.
 *
 * @param url     The URL to fetch, as a string.
 * @param urlInfo The same URL, parsed.
 * @returns The metadata record.
 * @throws Whatever `fetch` or reading the body rejects with; the caller handles it.
 */
const extractMetadata = async (url: string, urlInfo: URL) => {
    const data = await fetch(url);
    const contentTypeRaw = data.headers.get('content-type') ?? '';

    // parseMediaType throws on malformed parameters; the header is controlled by the
    // remote server, so fall back to the bare type instead of failing the lookup
    let mediaType = '';
    try {
        mediaType = parseMediaType(contentTypeRaw)[0];
    } catch(ex) {
        console.error(ex);
        mediaType = (contentTypeRaw.split(';')[0] ?? '').trim().toLowerCase();
    }

    // deno-lint-ignore no-explicit-any
    const info: Record<string, any> = {};
    // sets info[prop] only when a value is actually present, keeping earlier defaults otherwise
    const addInfoOrDont = (prop: string, value: unknown) => {
        if(value !== null && value !== undefined)
            info[prop] = value;
    };

    info.url = url;

    // decodeURIComponent throws URIError on malformed escapes (e.g. a path ending in "%")
    const fileName = basename(urlInfo.pathname);
    try {
        info.title = decodeURIComponent(fileName);
    } catch {
        info.title = fileName;
    }

    info.site_name = urlInfo.host;

    if(mediaType)
        info.media_type = mediaType;

    if(['text/html', 'application/xhtml+xml'].includes(mediaType)) {
        const html = cheerio.load(await readableStreamToString(data.body));

        const metaData = extractHtmlMetaData(html);
        const ogData = extractOpenGraphData(html);
        const twitterData = extractTwitterData(html);

        addInfoOrDont('url', ogData.url ?? metaData.canonical_url);
        addInfoOrDont('title', ogData.title ?? twitterData.title ?? metaData.title);
        addInfoOrDont('site_name', ogData.site_name);
        addInfoOrDont('description', ogData.description ?? twitterData.description ?? metaData.description);
        addInfoOrDont('color', metaData.theme_color);

        if(ogData.images?.length > 0) {
            const image = ogData.images[0];
            info.image = info.image_url = image.secure_url ?? image.url;

            if(image.width > 0)
                info.image_width = image.width;
            if(image.height > 0)
                info.image_height = image.height;
            if(image.type)
                info.image_type = image.type;
            if(image.alt)
                info.image_alt = image.alt;

            if(info.image_width > 0)
                info.width = info.image_width;
            if(info.image_height > 0)
                info.height = info.image_height;
        } else {
            addInfoOrDont('image_url', twitterData.image ?? metaData.image ?? metaData.thumbnail);
            addInfoOrDont('image_alt', twitterData.image_alt);

            if(info.image_url)
                info.image = info.image_url;
        }

        if(ogData.audios?.length > 0) {
            const audio = ogData.audios[0];
            info.audio_url = audio.secure_url ?? audio.url;

            if(audio.type)
                info.audio_type = audio.type;
        }

        // video dimensions, when present, override the image-derived width/height
        if(ogData.videos?.length > 0) {
            const video = ogData.videos[0];
            info.video_url = video.secure_url ?? video.url;

            if(video.width > 0)
                info.video_width = video.width;
            if(video.height > 0)
                info.video_height = video.height;
            if(video.type)
                info.video_type = video.type;
            if(video.tags?.length > 0)
                info.video_tags = video.tags;

            if(info.video_width > 0)
                info.width = info.video_width;
            if(info.video_height > 0)
                info.height = info.video_height;
        } else {
            addInfoOrDont('video_url', twitterData.player);
            addInfoOrDont('video_width', twitterData.player_width);
            addInfoOrDont('video_height', twitterData.player_height);

            if(info.video_width > 0)
                info.width = info.video_width;
            if(info.video_height > 0)
                info.height = info.video_height;
        }

        const linkedDatas = extractLinkedData(html);
        if(linkedDatas.length > 0)
            info.lds = linkedDatas;

        if(isAllowedOEmbedDomain(urlInfo.host)) {
            const oEmbedData = await extractOEmbedData(html, url, urlInfo);
            if(oEmbedData.version)
                info.oembed = oEmbedData;
        }
    } else {
        // TODO: image/, video/ and audio/ responses have no dedicated handling yet.
        // The body is not read on this path, so cancel it rather than leaving it open.
        await data.body?.cancel().catch((ex: unknown) => console.error(ex));
    }

    return info;
};
/**
 * HTTP entry point.
 *
 * Routes:
 *  - `OPTIONS *`                   CORS preflight answer (204)
 *  - `/metadata`                   metadata lookup for `?url=` (GET/HEAD) or the request body (POST)
 *  - `/metadata/batch`             stub that always returns an empty result set
 *  - `/metadata/thumb/audio|video` single-frame thumbnail produced by ffmpeg
 *  - anything else (GET/HEAD)      files below `public/`, handed off via `X-Accel-Redirect`
 *
 * Requests carrying an `Origin` header are rejected with 403 unless the
 * origin's host is listed in `allowedOrigins`.
 *
 * @param req The incoming request.
 * @returns The response to send.
 */
const requestHandler = async (req: Request): Promise<Response> => {
    const url = new URL(req.url);
    const headers: Record<string, string> = { 'X-Powered-By': 'Uiharu' };

    const originRaw = req.headers.get('origin');
    if(originRaw !== null) {
        // an unparseable Origin (e.g. the opaque origin "null") is treated as not allowed
        let originHost = '';
        try {
            originHost = new URL(originRaw).host;
        } catch {
            originHost = '';
        }

        if(originHost === '' || !allowedOrigins.includes(originHost))
            return new Response('403', { status: 403, headers });

        headers['Access-Control-Allow-Origin'] = originRaw;
        headers['Vary'] = 'Origin';
    }

    if(req.method === 'OPTIONS') {
        headers['Allow'] = 'OPTIONS, GET, HEAD, POST';
        headers['Access-Control-Allow-Methods'] = 'OPTIONS, GET, HEAD, POST';
        // idk if this is the appropriate status code
        return new Response('', { status: 204, headers });
    }

    if(url.pathname === '/metadata') {
        if(!['GET', 'HEAD', 'POST'].includes(req.method))
            return new Response('', { status: 405, headers });

        const started = performance.now();
        headers['Content-Type'] = 'application/json;charset=utf-8';

        let urlParamRaw: string = '';
        if(req.method === 'POST')
            urlParamRaw = (await readableStreamToString(req.body)).trim();
        else
            urlParamRaw = (new URLSearchParams(url.search)).get('url')?.trim() ?? '';

        if(urlParamRaw === '')
            return new Response('{"error":"metadata:uri"}', { status: 400, headers });

        // protocol-relative input
        if(urlParamRaw.startsWith('//'))
            urlParamRaw = 'https:' + urlParamRaw;

        let urlParam: URL;
        try {
            urlParam = new URL(urlParamRaw);
        } catch(ex) {
            return new Response('{"error":"metadata:uri"}', { status: 400, headers });
        }

        // the URL is fetched server-side; only plain web URLs are acceptable
        // (keeps file:, data:, blob: and friends away from fetch)
        if(!['http:', 'https:'].includes(urlParam.protocol))
            return new Response('{"error":"metadata:uri"}', { status: 400, headers });

        urlParamRaw = urlParam.toString();

        const urlHash = encodeBase64Url(
            await crypto.subtle.digest('SHA-256', new TextEncoder().encode(urlParamRaw))
        );
        const cacheKey = `uiharu:metadata:${urlHash}`;

        // const cacheInfo = await cache.get(cacheKey);
        // if(cacheInfo !== undefined)
        //     return new Response(
        //         brotliDecompressSync(cacheInfo.value),
        //         {
        //             status: 200,
        //             headers: {
        //                 ...headers,
        //                 ...{
        //                     'Server-Timing': `metadata;dur=${(performance.now() - started).toFixed(6)}`,
        //                     'X-Uiharu-State': 'cache',
        //                 },
        //             },
        //         }
        //     );

        try {
            const json = JSON.stringify(
                await extractMetadata(urlParamRaw, urlParam)
            );

            // fire-and-forget: a failing cache write must neither delay nor break the
            // response, but it also must not surface as an unhandled rejection
            Promise.resolve(cache.set(cacheKey, brotliCompressSync(json), {
                compress: false,
                lifetime: 600
            })).catch((ex: unknown) => console.error(ex));

            return new Response(json, {
                status: 200,
                headers: {
                    ...headers,
                    ...{
                        'Server-Timing': `metadata;dur=${(performance.now() - started).toFixed(6)}`,
                        'X-Uiharu-State': 'fresh',
                    },
                },
            });
        } catch(ex) {
            console.error(ex);
            return new Response('{"error":"metadata:lookup"}', { status: 500, headers });
        }
    }

    if(url.pathname === '/metadata/batch') {
        if(!['GET', 'HEAD', 'POST'].includes(req.method))
            return new Response('', { status: 405, headers });

        return new Response('{"took":0,"results":[]}', {
            headers: {
                ...headers,
                ...{ 'Content-Type': 'application/json' },
            },
        });
    }

    const isAudio = url.pathname === '/metadata/thumb/audio';
    const isVideo = url.pathname === '/metadata/thumb/video';
    if(isAudio || isVideo) {
        if(!['HEAD', 'GET'].includes(req.method))
            return new Response('', { status: 405, headers });

        let urlParamRaw: string = (new URLSearchParams(url.search)).get('url')?.trim() ?? '';
        if(urlParamRaw === '')
            return new Response('missing url parameter', { status: 400, headers });

        let scheme: string = '';
        try {
            const urlParam = new URL(urlParamRaw);
            scheme = urlParam.protocol;
            urlParamRaw = urlParam.toString();
        } catch(ex) {
            return new Response('invalid url parameter', { status: 400, headers });
        }

        if(!['http:', 'https:'].includes(scheme))
            return new Response('unsupported url scheme', { status: 400, headers });

        // this seems like a terrible idea lol
        // The URL was normalised above and is known to start with http(s):, and it is
        // passed as its own argv entry, so it cannot be read as an ffmpeg option.
        const args = ['-i', urlParamRaw];
        if(isAudio) args.push('-an');
        args.push('-f');
        args.push('image2pipe');
        args.push('-c:v');
        args.push(isVideo ? 'png' : 'copy');
        args.push('-frames:v');
        args.push('1');
        args.push('-');

        const { code, stdout, stderr } = await (new Deno.Command('ffmpeg', {
            stdin: 'null',
            stdout: 'piped',
            stderr: 'piped',
            args,
        })).output();
        if(code !== 0) {
            console.error(new TextDecoder().decode(stderr));
            return new Response('decode failed', { status: 500, headers });
        }

        // TODO: bother with cache someday maybe
        const thumb = stdout;

        return new Response(thumb, {
            headers: {
                ...headers,
                ...{
                    'Content-Type': 'image/png',
                    'Cache-Control': 'public, max-age=31536000, immutable',
                },
            },
        });
    }

    // serving files from /public dir
    if(['HEAD', 'GET'].includes(req.method)) {
        const localPathPrefix = import.meta.dirname + '/public/';
        const localPathSuffix = pathNormalize(url.pathname === '/' ? '/index.html' : url.pathname);
        const localPath = pathNormalize(localPathPrefix + localPathSuffix);

        // the prefix check keeps the resolved path inside public/
        if(localPath.startsWith(localPathPrefix) && existsSync(localPath)) {
            // a Map, so extensions such as "constructor" cannot hit prototype members
            const mediaTypes = new Map<string, string>([
                ['html', 'text/html;charset=utf-8'],
                ['css', 'text/css;charset=utf-8'],
                ['txt', 'text/plain;charset=utf-8'],
                ['png', 'image/png'],
            ]);

            let mediaType: string = 'application/octet-stream';
            const dotIndex = localPathSuffix.lastIndexOf('.');
            if(dotIndex >= 0) {
                const ext = localPathSuffix.substring(dotIndex + 1);
                mediaType = mediaTypes.get(ext) ?? mediaType;
            }

            // empty body: the file itself is handed off through the X-Accel-Redirect header
            return new Response('', {
                status: 200,
                headers: {
                    ...headers,
                    ...{
                        'Content-Type': mediaType,
                        'X-Accel-Redirect': `/_public${localPathSuffix}`,
                    }
                },
            });
        }

        // 404 page
        return new Response('<!doctype html><meta charset=utf-8><title>404 Not Found</title><h1>404 Not Found</h1>', {
            status: 404,
            headers: {
                ...headers,
                ...{ 'Content-Type': 'text/html;charset=utf-8' },
            },
        });
    }

    // 404 fallback
    return new Response('', {
        status: ['OPTIONS', 'HEAD', 'GET', 'POST'].includes(req.method) ? 404 : 405,
        headers,
    });
};
// Start the HTTP server on the configured port, handling every request with requestHandler.
Deno.serve({ port }, requestHandler);