Use NodeJS/Cheerio for HTML parsing.

This commit is contained in:
Pachira 2023-10-27 22:25:17 +00:00
parent d90e1e1c0b
commit 6a2060c7a6
5 changed files with 301 additions and 89 deletions

1
.gitignore vendored
View file

@ -6,3 +6,4 @@
/public/robots.txt
/lib/index-dev
/vendor
/node_modules

75
extract.mjs Normal file
View file

@ -0,0 +1,75 @@
import * as cheerio from 'cheerio';
/**
 * Reads everything from standard input and resolves with it as a single string.
 *
 * Chunks are kept as raw bytes and decoded as UTF-8 once the stream has ended,
 * so a multi-byte character that straddles a chunk boundary is decoded intact
 * instead of turning into replacement characters.
 *
 * @returns {Promise<string>} Resolves with the decoded input when stdin emits
 *   'end'; rejects with the stream error when stdin emits 'error'.
 */
const readStdIn = () => {
    return new Promise((resolve, reject) => {
        const chunks = [];
        process.stdin.on('data', chunk => {
            // A chunk is a string only when an encoding has been set on stdin; normalise to bytes.
            chunks.push(typeof chunk === 'string' ? Buffer.from(chunk) : chunk);
        });
        process.stdin.on('end', () => resolve(Buffer.concat(chunks).toString('utf8')));
        process.stdin.on('error', err => reject(err));
    });
};
// Parse the HTML document received on standard input.
const $ = cheerio.load(await readStdIn());

// Result record; the property order here is the key order of the emitted JSON.
const info = {
    title: '',
    metaTitle: '',
    desc: '',
    siteName: '',
    image: '',
    colour: '',
    type: 'website',
};

// Stores the value when the field is still empty or the value is longer than the stored one.
const keepLongest = (field, value) => {
    if(info[field] === '' || value.length > info[field].length)
        info[field] = value;
};

// Stores the value only while the field is still empty.
const keepFirst = (field, value) => {
    if(info[field] === '')
        info[field] = value;
};

// Maps a meta tag's name/property to the rule that records its value.
const metaHandlers = new Map([
    ['og:title', value => keepLongest('metaTitle', value)],
    ['twitter:title', value => keepLongest('metaTitle', value)],
    ['description', value => keepLongest('desc', value)],
    ['og:description', value => keepLongest('desc', value)],
    ['twitter:description', value => keepLongest('desc', value)],
    ['og:site_name', value => keepFirst('siteName', value)],
    ['og:image', value => keepFirst('image', value)],
    ['twitter:image', value => keepFirst('image', value)],
    ['theme-color', value => keepFirst('colour', value)],
    ['og:type', value => {
        // NOTE(review): info.type starts out as 'website', so this condition is
        // never true in this script and og:type is never applied — confirm
        // whether that is intended before changing it.
        if(info.type === '')
            info.type = `website:${value}`;
    }],
]);

// The document title is the trimmed text of the first <title> element, if any.
const firstTitle = $('title').first();
if(firstTitle.length > 0)
    info.title = firstTitle.text().trim();

$('meta').toArray().forEach(node => {
    const meta = $(node);

    // The key comes from "name", falling back to "property" only when "name" is absent.
    const key = (meta.attr('name') ?? meta.attr('property') ?? '').trim();
    if(key === '')
        return;

    // The value comes from "value", falling back to "content" only when "value" is absent.
    const value = (meta.attr('value') ?? meta.attr('content') ?? '').trim();

    metaHandlers.get(key)?.(value);
});

// Emit the collected information as one line of JSON on standard output.
console.log(JSON.stringify(info));

193
package-lock.json generated Normal file
View file

@ -0,0 +1,193 @@
{
"name": "uiharu.edgii.net",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"dependencies": {
"cheerio": "^1.0.0-rc.12"
}
},
"node_modules/boolbase": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
},
"node_modules/cheerio": {
"version": "1.0.0-rc.12",
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz",
"integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==",
"dependencies": {
"cheerio-select": "^2.1.0",
"dom-serializer": "^2.0.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
"htmlparser2": "^8.0.1",
"parse5": "^7.0.0",
"parse5-htmlparser2-tree-adapter": "^7.0.0"
},
"engines": {
"node": ">= 6"
},
"funding": {
"url": "https://github.com/cheeriojs/cheerio?sponsor=1"
}
},
"node_modules/cheerio-select": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
"integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
"dependencies": {
"boolbase": "^1.0.0",
"css-select": "^5.1.0",
"css-what": "^6.1.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/css-select": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz",
"integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==",
"dependencies": {
"boolbase": "^1.0.0",
"css-what": "^6.1.0",
"domhandler": "^5.0.2",
"domutils": "^3.0.1",
"nth-check": "^2.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/css-what": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz",
"integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==",
"engines": {
"node": ">= 6"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/dom-serializer": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
"entities": "^4.2.0"
},
"funding": {
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
}
},
"node_modules/domelementtype": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
]
},
"node_modules/domhandler": {
"version": "5.0.3",
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
"dependencies": {
"domelementtype": "^2.3.0"
},
"engines": {
"node": ">= 4"
},
"funding": {
"url": "https://github.com/fb55/domhandler?sponsor=1"
}
},
"node_modules/domutils": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz",
"integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
"dependencies": {
"dom-serializer": "^2.0.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3"
},
"funding": {
"url": "https://github.com/fb55/domutils?sponsor=1"
}
},
"node_modules/entities": {
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/htmlparser2": {
"version": "8.0.2",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
"integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
"funding": [
"https://github.com/fb55/htmlparser2?sponsor=1",
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
],
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
"entities": "^4.4.0"
}
},
"node_modules/nth-check": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
"dependencies": {
"boolbase": "^1.0.0"
},
"funding": {
"url": "https://github.com/fb55/nth-check?sponsor=1"
}
},
"node_modules/parse5": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
"integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==",
"dependencies": {
"entities": "^4.4.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5-htmlparser2-tree-adapter": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz",
"integrity": "sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==",
"dependencies": {
"domhandler": "^5.0.2",
"parse5": "^7.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
}
}
}

5
package.json Normal file
View file

@ -0,0 +1,5 @@
{
"dependencies": {
"cheerio": "^1.0.0-rc.12"
}
}

View file

@ -124,99 +124,37 @@ final class WebLookup implements \Uiharu\ILookup {
$body = self::reqBody($req);
self::reqClose($req);
$charSet = $mediaType->getCharset();
$urlHost = $url->getHost();
$charSetWrangle = function(string $input) use ($charSet, $urlHost): string {
// fuck it
if($urlHost === 'pixiv.net' || $urlHost === 'www.pixiv.net') {
$decoded = mb_convert_encoding($input, 'ISO-8859-1', 'UTF-8');
if(mb_check_encoding($decoded, 'UTF-8') && str_repeat('?', strlen($decoded)) !== $decoded)
return $decoded;
}
// OK, hear me out:
// there are absolutely no good HTML scraping libraries for PHP.
// DOMDocument exists, but it is bad at catching weird encoding cases like the ones pixiv produces,
// and I'm not about to rewrite this whole thing in Node.js.
// Also, at this point Index should probably provide a wrapper for proc_open.
$extract = proc_open(
sprintf('node %s/extract.mjs', UIH_ROOT),
[0 => ['pipe', 'r'], 1 => ['pipe', 'w'], 2 => ['pipe', 'w']],
$pipes
);
if(!is_resource($extract))
throw new RuntimeException('Could not open extract.');
return $input;
};
try {
fwrite($pipes[0], $body);
fclose($pipes[0]);
$document = new DOMDocument;
if($isXHTML) {
$document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
} else {
$document->encoding = $charSet;
$document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
foreach($document->childNodes as $child)
if($child->nodeType === XML_PI_NODE) {
$document->removeChild($child);
break;
}
$stderr = trim(stream_get_contents($pipes[2]));
if(!empty($stderr))
throw new RuntimeException('extract: ' . $stderr);
$stdout = trim(stream_get_contents($pipes[1]));
if(empty($stdout))
throw new RuntimeException('extract did not report any errors but exited without any output');
} finally {
proc_close($extract);
}
$siteInfo = new stdClass;
$siteInfo->title = '';
$siteInfo->metaTitle = '';
$siteInfo->desc = '';
$siteInfo->siteName = '';
$siteInfo->image = '';
$siteInfo->colour = '';
$siteInfo->type = 'website';
$titleTag = $document->getElementsByTagName('title');
foreach($titleTag as $tag) {
$siteInfo->title = $charSetWrangle(trim($tag->textContent));
break;
}
$metaTags = $document->getElementsByTagName('meta');
foreach($metaTags as $tag) {
$nameAttr = $tag->hasAttribute('name') ? $tag->getAttribute('name') : (
$tag->hasAttribute('property') ? $tag->getAttribute('property') : ''
);
$valueAttr = $tag->hasAttribute('value') ? $tag->getAttribute('value') : (
$tag->hasAttribute('content') ? $tag->getAttribute('content') : ''
);
$nameAttr = trim(mb_convert_encoding($nameAttr, 'utf-8', $charSet));
$valueAttr = trim(mb_convert_encoding($valueAttr, 'utf-8', $charSet));
if(empty($nameAttr) || empty($valueAttr))
continue;
$nameAttr = $charSetWrangle($nameAttr);
$valueAttr = $charSetWrangle($valueAttr);
switch($nameAttr) {
case 'og:title':
case 'twitter:title':
if(empty($siteInfo->metaTitle) || strlen($valueAttr) > strlen($siteInfo->metaTitle))
$siteInfo->metaTitle = $valueAttr;
break;
case 'description':
case 'og:description':
case 'twitter:description':
if(empty($siteInfo->desc) || strlen($valueAttr) > strlen($siteInfo->desc))
$siteInfo->desc = $valueAttr;
break;
case 'og:site_name':
if(empty($siteInfo->siteName))
$siteInfo->siteName = $valueAttr;
break;
case 'og:image':
case 'twitter:image':
if(empty($siteInfo->image))
$siteInfo->image = $valueAttr;
break;
case 'theme-color':
if(empty($siteInfo->colour))
$siteInfo->colour = $valueAttr;
break;
case 'og:type':
if(empty($siteInfo->type))
$siteInfo->type = 'website:' . $valueAttr;
break;
}
}
$siteInfo = json_decode($stdout);
if(empty($siteInfo))
throw new RuntimeException('Failed to parse extract output.');
return new WebLookupSiteResult($url, $mediaType, $siteInfo);
}