Fixed issues caused by websites lying about their character set.

2023-10-21 17:33:42 +00:00 · 2023-10-21 17:33:42 +00:00 · 842e91a413
commit 842e91a413
parent 1f256a40ca
1 changed files with 13 additions and 2 deletions
--- a/src/Lookup/WebLookup.php
+++ b/src/Lookup/WebLookup.php
@ -125,6 +125,15 @@ final class WebLookup implements \Uiharu\ILookup {
        self::reqClose($req);
        $charSet = $mediaType->getCharset();
        // Repairs text whose UTF-8 bytes were each read as a separate ISO-8859-1
        // character (double-encoded UTF-8). Only attempted when the declared
        // charset is UTF-8; in every other case the input is returned unchanged.
        //
        // $input   text to inspect, expected to be a UTF-8 string.
        // returns  the repaired string when the repair is provably lossless,
        //          otherwise $input exactly as given.
        $charSetWrangle = function(string $input) use ($charSet): string {
            if(strtoupper($charSet) !== 'UTF-8')
                return $input;

            // Collapse every code point back into the single byte it may have been read from.
            $decoded = mb_convert_encoding($input, 'ISO-8859-1', 'UTF-8');

            // The collapsed bytes must themselves be well-formed UTF-8 to count as a repair.
            if(!mb_check_encoding($decoded, 'UTF-8'))
                return $input;

            // Code points above U+00FF cannot be represented in ISO-8859-1 and get
            // substituted during the conversion above. Without this round-trip check,
            // correctly decoded text (e.g. CJK) would come back as a valid-looking
            // run of substitution characters instead of being left alone.
            if(mb_convert_encoding($decoded, 'UTF-8', 'ISO-8859-1') !== $input)
                return $input;

            return $decoded;
        };
        $document = new DOMDocument;
        if($isXHTML) {
@ -150,8 +159,7 @@ final class WebLookup implements \Uiharu\ILookup {
        $titleTag = $document->getElementsByTagName('title');
        foreach($titleTag as $tag) {
-            $content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
+            $siteInfo->title = $charSetWrangle(trim($tag->textContent));
-            $siteInfo->title = $content;
            break;
        }
@ -168,6 +176,9 @@ final class WebLookup implements \Uiharu\ILookup {
            if(empty($nameAttr) || empty($valueAttr))
                continue;
            $nameAttr = $charSetWrangle($nameAttr);
            $valueAttr = $charSetWrangle($valueAttr);
            switch($nameAttr) {
                case 'og:title':
                case 'twitter:title':