Fixed double encoding, probably.
This commit is contained in:
parent
d90927469f
commit
5c9b13073d
1 changed file with 21 additions and 11 deletions
|
@@ -34,7 +34,7 @@ final class WebLookup implements \Uiharu\ILookup {
|
||||||
CURLOPT_DEFAULT_PROTOCOL => 'https',
|
CURLOPT_DEFAULT_PROTOCOL => 'https',
|
||||||
CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION,
|
CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION,
|
||||||
CURLOPT_HTTPHEADER => [
|
CURLOPT_HTTPHEADER => [
|
||||||
'Accept: text/html,application/xhtml+xml',
|
'Accept: text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8',
|
||||||
],
|
],
|
||||||
]);
|
]);
|
||||||
return $curl;
|
return $curl;
|
||||||
|
@@ -107,7 +107,7 @@ final class WebLookup implements \Uiharu\ILookup {
|
||||||
$mediaType = MediaType::parse('application/octet-stream');
|
$mediaType = MediaType::parse('application/octet-stream');
|
||||||
}
|
}
|
||||||
|
|
||||||
$isXHTML = $mediaType->equals('application/xhtml+xml');
|
$isXHTML = $mediaType->equals('application/xhtml+xml') || $mediaType->equals('application/xml');
|
||||||
if($isXHTML || $mediaType->equals('text/html'))
|
if($isXHTML || $mediaType->equals('text/html'))
|
||||||
return $this->lookupSite($url, $req, $mediaType, $isXHTML);
|
return $this->lookupSite($url, $req, $mediaType, $isXHTML);
|
||||||
|
|
||||||
|
@@ -123,22 +123,21 @@ final class WebLookup implements \Uiharu\ILookup {
|
||||||
$body = self::reqBody($req);
|
$body = self::reqBody($req);
|
||||||
self::reqClose($req);
|
self::reqClose($req);
|
||||||
|
|
||||||
|
$charSet = $mediaType->getCharset();
|
||||||
|
|
||||||
$document = new DOMDocument;
|
$document = new DOMDocument;
|
||||||
if($isXHTML) {
|
if($isXHTML) {
|
||||||
$document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
|
$document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
|
||||||
} else {
|
} else {
|
||||||
|
$document->encoding = $charSet;
|
||||||
$document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
|
$document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
|
||||||
foreach($document->childNodes as $child)
|
foreach($document->childNodes as $child)
|
||||||
if($child->nodeType === XML_PI_NODE) {
|
if($child->nodeType === XML_PI_NODE) {
|
||||||
$document->removeChild($child);
|
$document->removeChild($child);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
$document->encoding = $mediaType->getCharset();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$charSet = $document->encoding;
|
|
||||||
|
|
||||||
$siteInfo = new stdClass;
|
$siteInfo = new stdClass;
|
||||||
$siteInfo->title = '';
|
$siteInfo->title = '';
|
||||||
$siteInfo->metaTitle = '';
|
$siteInfo->metaTitle = '';
|
||||||
|
@@ -150,7 +149,10 @@ final class WebLookup implements \Uiharu\ILookup {
|
||||||
|
|
||||||
$titleTag = $document->getElementsByTagName('title');
|
$titleTag = $document->getElementsByTagName('title');
|
||||||
foreach($titleTag as $tag) {
|
foreach($titleTag as $tag) {
|
||||||
$siteInfo->title = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
|
$content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
|
||||||
|
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($content)) === 'UTF-8')
|
||||||
|
$content = $decoded;
|
||||||
|
$siteInfo->title = $content;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -167,25 +169,33 @@ final class WebLookup implements \Uiharu\ILookup {
|
||||||
if(empty($nameAttr) || empty($valueAttr))
|
if(empty($nameAttr) || empty($valueAttr))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($nameAttr)) === 'UTF-8')
|
||||||
|
$nameAttr = $decoded;
|
||||||
|
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($valueAttr)) === 'UTF-8')
|
||||||
|
$valueAttr = $decoded;
|
||||||
|
|
||||||
switch($nameAttr) {
|
switch($nameAttr) {
|
||||||
case 'og:title':
|
case 'og:title':
|
||||||
case 'twitter:title':
|
case 'twitter:title':
|
||||||
|
if(empty($siteInfo->metaTitle) || strlen($valueAttr) > strlen($siteInfo->metaTitle))
|
||||||
$siteInfo->metaTitle = $valueAttr;
|
$siteInfo->metaTitle = $valueAttr;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'description':
|
case 'description':
|
||||||
case 'og:description':
|
case 'og:description':
|
||||||
case 'twitter:description':
|
case 'twitter:description':
|
||||||
if(empty($siteInfo->desc))
|
if(empty($siteInfo->desc) || strlen($valueAttr) > strlen($siteInfo->desc))
|
||||||
$siteInfo->desc = $valueAttr;
|
$siteInfo->desc = $valueAttr;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'og:site_name':
|
case 'og:site_name':
|
||||||
|
if(empty($siteInfo->siteName))
|
||||||
$siteInfo->siteName = $valueAttr;
|
$siteInfo->siteName = $valueAttr;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'og:image':
|
case 'og:image':
|
||||||
case 'twitter:image':
|
case 'twitter:image':
|
||||||
|
if(empty($siteInfo->image))
|
||||||
$siteInfo->image = $valueAttr;
|
$siteInfo->image = $valueAttr;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue