Fixed issues caused by websites lying about their character set.
This commit is contained in:
parent
1f256a40ca
commit
842e91a413
1 changed file with 13 additions and 2 deletions
|
@ -125,6 +125,15 @@ final class WebLookup implements \Uiharu\ILookup {
|
||||||
self::reqClose($req);
|
self::reqClose($req);
|
||||||
|
|
||||||
$charSet = $mediaType->getCharset();
|
$charSet = $mediaType->getCharset();
|
||||||
|
// Repairs text that looks double-encoded when the page declares UTF-8:
// the string's code points are mapped back to single ISO-8859-1 bytes and,
// if those bytes form valid UTF-8, that byte sequence is returned instead.
// For any other declared character set the input is returned untouched.
//
// @param string $input Text extracted from the parsed document.
// @return string The repaired text, or $input unchanged when no repair applies.
$charSetWrangle = function(string $input) use ($charSet): string {
    if(strtoupper($charSet) === 'UTF-8') {
        $decoded = mb_convert_encoding($input, 'ISO-8859-1', 'UTF-8');

        // mb_convert_encoding() replaces characters that ISO-8859-1 cannot
        // represent with a substitute character, and the resulting ASCII is
        // always valid UTF-8. Without the round-trip comparison, genuine
        // UTF-8 text outside Latin-1 (CJK, Cyrillic, emoji, ...) would come
        // back as a run of substitute characters. Only accept the decoded
        // form when the conversion was lossless.
        if(mb_check_encoding($decoded, 'UTF-8')
            && mb_convert_encoding($decoded, 'UTF-8', 'ISO-8859-1') === $input)
            return $decoded;
    }

    return $input;
};
|
||||||
|
|
||||||
$document = new DOMDocument;
|
$document = new DOMDocument;
|
||||||
if($isXHTML) {
|
if($isXHTML) {
|
||||||
|
@ -150,8 +159,7 @@ final class WebLookup implements \Uiharu\ILookup {
|
||||||
|
|
||||||
$titleTag = $document->getElementsByTagName('title');
|
$titleTag = $document->getElementsByTagName('title');
|
||||||
foreach($titleTag as $tag) {
|
foreach($titleTag as $tag) {
|
||||||
$content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
|
$siteInfo->title = $charSetWrangle(trim($tag->textContent));
|
||||||
$siteInfo->title = $content;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -168,6 +176,9 @@ final class WebLookup implements \Uiharu\ILookup {
|
||||||
if(empty($nameAttr) || empty($valueAttr))
|
if(empty($nameAttr) || empty($valueAttr))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
$nameAttr = $charSetWrangle($nameAttr);
|
||||||
|
$valueAttr = $charSetWrangle($valueAttr);
|
||||||
|
|
||||||
switch($nameAttr) {
|
switch($nameAttr) {
|
||||||
case 'og:title':
|
case 'og:title':
|
||||||
case 'twitter:title':
|
case 'twitter:title':
|
||||||
|
|
Loading…
Reference in a new issue