Fixed double encoding, probably.
This commit is contained in:
parent
d90927469f
commit
5c9b13073d
1 changed file with 21 additions and 11 deletions
|
@@ -34,7 +34,7 @@ final class WebLookup implements \Uiharu\ILookup {
|
|||
CURLOPT_DEFAULT_PROTOCOL => 'https',
|
||||
CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION,
|
||||
CURLOPT_HTTPHEADER => [
|
||||
'Accept: text/html,application/xhtml+xml',
|
||||
'Accept: text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8',
|
||||
],
|
||||
]);
|
||||
return $curl;
|
||||
|
@@ -107,7 +107,7 @@ final class WebLookup implements \Uiharu\ILookup {
|
|||
$mediaType = MediaType::parse('application/octet-stream');
|
||||
}
|
||||
|
||||
$isXHTML = $mediaType->equals('application/xhtml+xml');
|
||||
$isXHTML = $mediaType->equals('application/xhtml+xml') || $mediaType->equals('application/xml');
|
||||
if($isXHTML || $mediaType->equals('text/html'))
|
||||
return $this->lookupSite($url, $req, $mediaType, $isXHTML);
|
||||
|
||||
|
@@ -123,22 +123,21 @@ final class WebLookup implements \Uiharu\ILookup {
|
|||
$body = self::reqBody($req);
|
||||
self::reqClose($req);
|
||||
|
||||
$charSet = $mediaType->getCharset();
|
||||
|
||||
$document = new DOMDocument;
|
||||
if($isXHTML) {
|
||||
$document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
|
||||
} else {
|
||||
$document->encoding = $charSet;
|
||||
$document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
|
||||
foreach($document->childNodes as $child)
|
||||
if($child->nodeType === XML_PI_NODE) {
|
||||
$document->removeChild($child);
|
||||
break;
|
||||
}
|
||||
|
||||
$document->encoding = $mediaType->getCharset();
|
||||
}
|
||||
|
||||
$charSet = $document->encoding;
|
||||
|
||||
$siteInfo = new stdClass;
|
||||
$siteInfo->title = '';
|
||||
$siteInfo->metaTitle = '';
|
||||
|
@@ -150,7 +149,10 @@ final class WebLookup implements \Uiharu\ILookup {
|
|||
|
||||
$titleTag = $document->getElementsByTagName('title');
|
||||
foreach($titleTag as $tag) {
|
||||
$siteInfo->title = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
|
||||
$content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
|
||||
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($content)) === 'UTF-8')
|
||||
$content = $decoded;
|
||||
$siteInfo->title = $content;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@@ -167,25 +169,33 @@ final class WebLookup implements \Uiharu\ILookup {
|
|||
if(empty($nameAttr) || empty($valueAttr))
|
||||
continue;
|
||||
|
||||
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($nameAttr)) === 'UTF-8')
|
||||
$nameAttr = $decoded;
|
||||
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($valueAttr)) === 'UTF-8')
|
||||
$valueAttr = $decoded;
|
||||
|
||||
switch($nameAttr) {
|
||||
case 'og:title':
|
||||
case 'twitter:title':
|
||||
if(empty($siteInfo->metaTitle) || strlen($valueAttr) > strlen($siteInfo->metaTitle))
|
||||
$siteInfo->metaTitle = $valueAttr;
|
||||
break;
|
||||
|
||||
case 'description':
|
||||
case 'og:description':
|
||||
case 'twitter:description':
|
||||
if(empty($siteInfo->desc))
|
||||
if(empty($siteInfo->desc) || strlen($valueAttr) > strlen($siteInfo->desc))
|
||||
$siteInfo->desc = $valueAttr;
|
||||
break;
|
||||
|
||||
case 'og:site_name':
|
||||
if(empty($siteInfo->siteName))
|
||||
$siteInfo->siteName = $valueAttr;
|
||||
break;
|
||||
|
||||
case 'og:image':
|
||||
case 'twitter:image':
|
||||
if(empty($siteInfo->image))
|
||||
$siteInfo->image = $valueAttr;
|
||||
break;
|
||||
|
||||
|
|
Loading…
Reference in a new issue