%PDF- %PDF-
Direktori : /home/bitrix/www/bitrix/modules/main/lib/urlpreview/parser/ |
Current File : /home/bitrix/www/bitrix/modules/main/lib/urlpreview/parser/schemaorg.php |
<?php namespace Bitrix\Main\UrlPreview\Parser; use Bitrix\Main\Context; use Bitrix\Main\Text\Encoding; use Bitrix\Main\UrlPreview\HtmlDocument; use Bitrix\Main\UrlPreview\Parser; class SchemaOrg extends Parser { /** @var \DOMDocument */ protected $dom; /** @var array */ protected $schemaMetadata = array(); protected $documentEncoding; /** * Parses HTML document's Schema.org metadata. * * @param HtmlDocument $document */ public function handle(HtmlDocument $document) { $this->documentEncoding = $document->getEncoding(); if(strpos($document->getHtml(), 'itemscope') === false) return null; if(!$this->initializeDom($document)) return null; if(!$this->getSchemaMetadata()) return null; if(strlen($document->getTitle()) == 0 && isset($this->schemaMetadata['name'])) { $document->setTitle($this->schemaMetadata['name']); } if(strlen($document->getDescription()) == 0 && isset($this->schemaMetadata['description'])) { $document->setDescription($this->schemaMetadata['description']); } if(strlen($document->getImage()) == 0 && isset($this->schemaMetadata['image'])) { $document->setImage($this->schemaMetadata['image']); } } /** * @return bool */ protected function getSchemaMetadata() { // Starting with first node with itemscope attribute, to prevent walking over full document. $xpath = new \DOMXPath($this->dom); $itemScopeNodes = $xpath->query('//*[@itemscope]'); if(!is_a($itemScopeNodes, '\DOMNodeList') || $itemScopeNodes->length < 1) return false; $mainNode = $itemScopeNodes->item(0); if(!is_a($mainNode, '\DOMElement')) return false; $this->walkDomTree($mainNode); return true; } /** * @param \DOMElement $currentNode * @param int $currentDepth */ protected function walkDomTree(\DOMElement $currentNode, $currentDepth = 0) { $this->handleNode($currentNode); foreach($currentNode->childNodes as $childNode) { if(is_a($childNode, '\DOMElement') && !($currentDepth == 0 xor $currentNode->hasAttribute('itemscope'))) { $this->walkDomTree($childNode, $currentDepth + 1); } } } /** * @param \DOMElement $node * @return null|string */ protected function getSchemaPropertyValue(\DOMElement $node) { $result = null; switch($node->tagName) { case 'img': $result = $node->getAttribute('src'); break; case 'meta': $result = $node->getAttribute('content'); break; case 'a': $result = $node->getAttribute('href'); break; case 'time': if($node->hasAttribute('datetime')) $result = $node->getAttribute('datetime'); else $result = $node->textContent; break; case 'div': $result = $this->getNodeInnerHtml($node); break; case 'p': case 'span': case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': $result = $node->textContent; break; } // dom extension's internal encoding is always utf-8 $result = Encoding::convertEncoding($result, 'utf-8', $this->documentEncoding); $result = trim($result); return (strlen($result) > 0 ? $result : null); } /** * @param \DOMElement $node */ protected function handleNode(\DOMElement $node) { if($node->hasAttribute('itemprop') && !$node->hasAttribute('itemscope')) { $propertyName = strtolower($node->getAttribute('itemprop')); $propertyValue = $this->getSchemaPropertyValue($node); $this->schemaMetadata[$propertyName] = $propertyValue; } } /** * @param \DOMElement $element * @return string */ protected function getNodeInnerHtml(\DOMElement $element) { $innerHTML = ""; $children = $element->childNodes; foreach ($children as $child) { $innerHTML .= $element->ownerDocument->saveHTML($child); } return $innerHTML; } /** * @param HtmlDocument $document * @return bool */ protected function initializeDom(HtmlDocument $document) { if(!class_exists('DOMDocument')) { return false; } $this->dom = new \DOMDocument(); // Prevents parsing errors bubbling libxml_use_internal_errors(true); $result = $this->dom->loadHTML('<?xml encoding="'.$document->getEncoding().'">'.$document->getHtml()); return $result; } }