Current File : /home/honehdyv/reversevillain.com/wp-content/plugins/wp-automatic/inc/class.dom.php
<?php
/**
 * Class wpAutomaticDom: extracts content from HTML using a class name, an id, a regex or an XPath expression
 * @author ValvePress
 * @version:1.0.0
 */
 
class wpAutomaticDom {
	
	/** @var string The HTML after charset/table normalisation; used by the regex based getters. */
	public $html;
	
	/** @var DOMDocument Parsed representation of $html; used by the XPath based getters. */
	public $doc;
	
	/** @var mixed When truthy, getSimilarLinks() prints extra diagnostic output. */
	public $debug;
	
	/**
	 * Normalise the markup (charset meta, table sections) and parse it into a DOMDocument.
	 *
	 * An empty (or whitespace only) document is not handed to libxml: the object then wraps an
	 * empty DOMDocument and every XPath based getter simply returns no matches.
	 *
	 * @param string $html raw HTML of the page
	 * @throws Exception when the parser throws while loading the document
	 */
	function __construct($html){
		
		$html = $this->ensureCharsetMeta($html);
		$html = $this->wrapTableSections($html);
		
		$this->html = $html;
		$this->doc  = new DOMDocument;
		
		// Collect libxml parse errors internally instead of emitting PHP warnings
		$internalErrors = libxml_use_internal_errors(true);
		
		try {
			
			// loadHTML('') raises a ValueError on PHP 8 (a warning before), so skip empty documents
			if(trim($html) !== ''){
				$this->doc->loadHTML($html);
			}
			
		} catch (Exception $e) {
			libxml_clear_errors();
			libxml_use_internal_errors($internalErrors);
			throw new Exception('Failed to load the Document as a Dom');
		}
		
		// Drop the collected errors so they do not pile up, then restore the caller's setting
		libxml_clear_errors();
		libxml_use_internal_errors($internalErrors);
		
	}
	
	/**
	 * Make sure the document carries an http-equiv content-type meta so libxml can pick the encoding.
	 * The charset is taken from any existing charset declaration in the page, defaulting to UTF-8.
	 *
	 * @param string $html
	 * @return string the HTML, with the meta injected when it was missing and a head tag exists
	 */
	private function ensureCharsetMeta($html){
		
		// already declared the way libxml expects it
		if(stristr($html, '<meta http-equiv="Content-Type" content="text/html; charset') !== false){
			return $html;
		}
		
		// accept quoted (charset="x") as well as unquoted (charset=x) declarations
		$charSet = 'UTF-8';
		if(preg_match('{charset\s*=\s*["\']?\s*([a-z0-9_.:\-]+)}i', $html, $encMatch)){
			$charSet = $encMatch[1];
		}
		
		$charSetMeta = '<meta http-equiv="content-type" content="text/html; charset=' . $charSet . '"/>';
		
		// preferred position: right after the opening head tag (any case, attributes allowed)
		if(preg_match('{<head(?:\s[^>]*)?>}i', $html, $headOpen, PREG_OFFSET_CAPTURE)){
			$insertAt = $headOpen[0][1] + strlen($headOpen[0][0]);
			return substr($html, 0, $insertAt) . $charSetMeta . substr($html, $insertAt);
		}
		
		// otherwise right before the closing head tag
		$headClosePos = stripos($html, '</head>');
		if($headClosePos !== false){
			return substr($html, 0, $headClosePos) . $charSetMeta . substr($html, $headClosePos);
		}
		
		return $html;
	}
	
	/**
	 * Insert <tbody> after table openings and </tbody> before table closings where the
	 * neighbouring tag is not already a tbody/thead (opening) or tbody/tfoot (closing) tag.
	 *
	 * @param string $html
	 * @return string
	 */
	private function wrapTableSections($html){
		
		// opening side: <table ...> followed by the next tag
		$opened = preg_replace_callback(
			'{(<table.*?>)([\s]*<.*?>)}s',
			function ($match) {
				if(stristr($match[0], '<tbody') !== false || stristr($match[0], '<thead') !== false){
					return $match[0];
				}
				return $match[1] . '<tbody>' . $match[2];
			},
			$html
		);
		
		// preg_replace_callback returns null on a PCRE failure; keep the input in that case
		if($opened !== null) $html = $opened;
		
		// closing side: the tag right before </table>
		$closed = preg_replace_callback(
			'{(<[^<]*?>[\s]*)(</table.*?>)}s',
			function ($match) {
				if(stristr($match[0], 'tbody') !== false || stristr($match[0], 'tfoot') !== false){
					return $match[0];
				}
				return $match[1] . '</tbody>' . $match[2];
			},
			$html
		);
		
		if($closed !== null) $html = $closed;
		
		return $html;
	}
	
	/**
	 * Run an XPath expression against the document.
	 *
	 * @param string $xpath
	 * @return DOMNodeList|null the matches, or null when the expression could not be evaluated
	 */
	private function runXPath($xpath){
		
		$xpathObj = new DOMXPath($this->doc);
		
		// the expression is user supplied: an invalid one should yield "no matches", not a warning
		$found = @$xpathObj->query("$xpath");
		
		return ($found instanceof DOMNodeList) ? $found : null;
	}
	
	/**
	 * Child nodes of a matched node as something safe to iterate over.
	 *
	 * @param DOMNode $node
	 * @return DOMNodeList|array
	 */
	private function childNodesOf($node){
		
		$children = $node->childNodes;
		
		return ($children instanceof DOMNodeList) ? $children : array();
	}
	
	/**
	 * Quote an arbitrary string as an XPath 1.0 string literal.
	 * XPath 1.0 has no escape character, so a value holding both quote kinds is built with concat().
	 *
	 * @param string $value
	 * @return string
	 */
	private function xpathLiteral($value){
		
		if(strpos($value, "'") === false) return "'" . $value . "'";
		if(strpos($value, '"') === false) return '"' . $value . '"';
		
		return "concat('" . str_replace("'", "', \"'\", '", $value) . "')";
	}
	
	/**
	 * Get content from the dom using an XPath
	 * @param string $xpath
	 * @param bool $inner true: HTML of the children of each match (each child followed by a new line), false: HTML of the match itself
	 * @return string[] one entry per match, empty when nothing matched or the XPath is invalid
	 */
	function getContentByXPath($xpath,$inner=true){
		
		$allMatchs = array();
		
		$xpathMatches = $this->runXPath($xpath);
		if($xpathMatches === null) return $allMatchs;
		
		foreach ($xpathMatches as $element) {
			
			if($inner){
				
				$matchHtml = ''; // single match html
				
				foreach ($this->childNodesOf($element) as $node) {
					$matchHtml .= $this->doc->saveHTML($node) . "\n";
				}
				
			}else{
				$matchHtml = $this->doc->saveHTML($element);
			}
			
			$allMatchs[] = $matchHtml;
		}
		
		return $allMatchs;
		
	}

	/**
	 * Get childrens by XPath
	 * @param string $xpath
	 * @return array[] one entry per match, each being the list of HTML strings of its child nodes
	 */
	function getChildsByXPath($xpath){
		
		$allMatchs = array();
		
		$xpathMatches = $this->runXPath($xpath);
		if($xpathMatches === null) return $allMatchs;
		
		foreach ($xpathMatches as $element) {
			
			$matchHtml = array(); // children html of a single match
			
			foreach ($this->childNodesOf($element) as $node) {
				$matchHtml[] = $this->doc->saveHTML($node);
			}
			
			$allMatchs[] = $matchHtml;
		}
		
		return $allMatchs;
		
	}
	
	/**
	 * Get content from dom using class name
	 * @param string $className a single class name; matched as a whole word of the class attribute
	 * @param bool $inner see getContentByXPath()
	 * @return string[]
	 */
	function getContentByClass($className,$inner=true){
		
		$className = trim($className) ;
		
		// the padded name is quoted as a literal so quotes inside it cannot break the expression
		$XPath = '//*[contains(concat(" ", normalize-space(@class), " "), ' . $this->xpathLiteral(' ' . $className . ' ') . ')]';
		
		return $this->getContentByXPath($XPath,$inner) ;
	}
	
	/**
	 * Get content from dom using id
	 * @param string $id
	 * @param bool $inner see getContentByXPath()
	 * @return string[]
	 */
	function getContentByID($id,$inner=true){
		
		$id = trim($id);
		$XPath = '//*[@id=' . $this->xpathLiteral($id) . ']' ;
		
		return $this->getContentByXPath($XPath,$inner) ;
	}	
	
	/**
	 * Get default title from title tag or h1 tag 
	 * @return string the trimmed title, or an empty string when neither tag holds text
	 */
	function getTheTitle(){
		
		//return title from title tag (any case, attributes allowed)
		if(preg_match('{<title(?:\s[^>]*)?>(.*?)</title>}is', $this->html, $titleMatchs)){
			$possibleTitle = trim($titleMatchs[1]);
			if($possibleTitle !== '') return $possibleTitle;
		}
		
		//get from h1
		if(preg_match('{<h1.*?>(.*?)</h1>}is', $this->html, $titleMatchs)){
			$possibleTitle = trim($titleMatchs[1]);
			if($possibleTitle !== '') return $possibleTitle;
		}
		
		//default empty
		return '';
		
	}
	

	/**
	 * Extract the main content of the page using the bundled wp_automatic_Readability class.
	 * Prints a notice and returns an empty string when the extractor reports failure.
	 * @return string HTML of the detected content
	 */
	function getFullContent(){
		
		//readability
		require_once 'wp_automatic_readability/wp_automatic_Readability.php';
		$wp_automatic_Readability = new wp_automatic_Readability ( $this->html );
		
		$wp_automatic_Readability->debug = false;
		
		if ( ! $wp_automatic_Readability->init () ) {
			echo '<br>Failed to find the content.';
			return '';
		}
		
		// Readability content
		$content = $wp_automatic_Readability->getContent ()->innerHTML;
		
		// Remove the wp_automatic_Readability attributes left on the extracted markup
		$content = preg_replace('{ wp_automatic_Readability\=".*?"}s', '', $content);
		
		// Self closed iframes are turned into an explicit open/close pair
		if(preg_match_all('{<iframe[^<]*/>}s', $content, $ifrMatches)){
			
			foreach ($ifrMatches[0] as $iframeFound){
				$correctIframe = str_replace('/>', '></iframe>', $iframeFound);
				$content = str_replace($iframeFound, $correctIframe, $content);
			}
			
		}
		
		// Release the extractor before returning
		unset($wp_automatic_Readability);
		
		return $content;
		
	}
	
	/**
	 * Extract content by a regex ex <h1>(.*?)</h1>
	 * The pattern is wrapped in {} delimiters and runs case-insensitive with dot matching new lines.
	 * @param string $regex pattern without delimiters
	 * @return string[] values of the first capture group when the pattern has one, otherwise the full
	 *                  matches; an empty array when nothing matched or the pattern could not be run
	 */
	function getContentByRegex($regex){
		
		preg_match_all('{'.$regex.'}is', $this->html,$matchregex);
		
		if(isset($matchregex[1])) return $matchregex[1];
		
		if(isset($matchregex[0])) return $matchregex[0];
		
		return array();
		
	}
	
	/**
	 * Find the link by xPath and all similar links in a page content. used by the multi-page scraper
	 * Similar links have the same XPath once positional indexes are ignored; when more than five of
	 * them differ from the chosen link in at most one path step, only those are returned.
	 * Prints progress information while running.
	 * @param string $xpath XPath that contains an a step
	 * @return array two lists: array( URLs , link texts )
	 * @throws Exception when the XPath does not contain an a step
	 */
	function getSimilarLinks($xpath){
		
		//refine the xpath and find the a tag
		$xpath = trim($xpath);
		
		$endsWithAnchor   = preg_match('{/a$}', $xpath) === 1;
		$hasInnerAnchor   = stristr($xpath, '/a/') !== false;
		$hasIndexedAnchor = stristr($xpath, '/a[') !== false;
		
		if(! $endsWithAnchor && ! $hasInnerAnchor && ! $hasIndexedAnchor){
			throw new Exception('Provided XPath does not contain the a tag');
		}
		
		//good we have the a tag if it is not the last one, make it the last
		if(! $endsWithAnchor){
			
			if(! $hasIndexedAnchor){
				$xpathParts = explode('/a/', $xpath);
				array_pop($xpathParts);
				$xpath = implode('/a/', $xpathParts) . '/a';
			}else{
				// anchored on the leading slash so steps merely ending in "a" (area[2], data[1]) are skipped
				$xpath = preg_replace('!(/a\[\d*?\]).*!', "$1", $xpath);
			}
			
		}
		
		//find the chosen node
		$xpathObj = new DOMXPath($this->doc);
		$xpathMatches = @$xpathObj->query("$xpath");
		
		$choseNode = ($xpathMatches instanceof DOMNodeList) ? $xpathMatches->item(0) : null;
		
		if($choseNode !== null){
			//well we have a match
			$chosenNodePath = $choseNode->getNodePath();
		}else{
			echo '<br>Failed to get the alleged node, trusting the provided XPath instead';
			$chosenNodePath = $xpath;
		}
		
		$chosenNodePathNoDigits = preg_replace('{\[\d*?\]}', '[]', $chosenNodePath);
		$chosenNodePathParts = explode('/', $chosenNodePath);
		
		echo '<br>Chosen node dom XPath:'.$chosenNodePath;
		
		if($this->debug) print_r($chosenNodePathParts);
		
		//get all links in the dom
		$LinksWithSimilarPath = array();
		$LinksWithSimilarPathTitles = array();
		$LinksWithSimilarPathStrict = array();
		$LinksWithSimilarPathStrictTitles = array();
		$changeIndexArr = array(); // per strict link: index of the path step that differs (0 = none)
		
		$allLinksMatches = @$xpathObj->query("//a");
		$allLinksCount = 0;
		
		if($allLinksMatches instanceof DOMNodeList){
			
			$allLinksCount = $allLinksMatches->length;
			
			foreach ($allLinksMatches as $singleLink){
				
				$currentNodePath = $singleLink->getNodePath() ;
				$currentNodePathNoDigits = preg_replace('{\[\d*?\]}', '[]', $currentNodePath );
				
				if($currentNodePathNoDigits != $chosenNodePathNoDigits) continue;
				
				$LinksWithSimilarPath[] = $singleLink->getAttribute('href');
				$LinksWithSimilarPathTitles[] = $singleLink->nodeValue;
				
				if($this->debug) echo "\n\n<br><br>".$singleLink->getNodePath().'<br><br>'.$singleLink->nodeValue . '<br><br>' ;
				
				//verify num of changes in xPath
				$numOfChanges = 0;
				$nodeChangeIndex = 0 ; //where exactly there were a change
				
				foreach (explode('/', $currentNodePath) as $partIndex => $currentPart){
					
					// a step missing on the chosen side counts as a change
					if(! isset($chosenNodePathParts[$partIndex]) || $currentPart !== $chosenNodePathParts[$partIndex]){
						$nodeChangeIndex = $partIndex;
						$numOfChanges++;
					}
					
				}
				
				if($numOfChanges < 2 ){
					$LinksWithSimilarPathStrict[] = $singleLink->getAttribute('href');
					$LinksWithSimilarPathStrictTitles[] = $singleLink->nodeValue;
					$changeIndexArr[] = $nodeChangeIndex;
				}
				
			}
			
		}
		
		if($this->debug){
			
			echo "\n<br>------- Strict similar XPath---------\n<br>";
			print_r($LinksWithSimilarPathStrict);
			print_r($changeIndexArr);
		}
		
		if(count($LinksWithSimilarPathStrict) <= 5){
			return array($LinksWithSimilarPath , $LinksWithSimilarPathTitles);
		}
		
		//better results are here lets find the odd result if any
		$values = array_count_values($changeIndexArr);
		arsort($values);
		
		if($this->debug){
			echo "\n<br>-------Most common change part index  ---------\n<br>";
			print_r($values);
		}
		
		$changeArrKeys = array_keys($values) ;
		$correctNodeIndex = $changeArrKeys[0]; // index of the change 
		
		echo "\n<br> All Links:" . $allLinksCount . ", Similar links:".count($LinksWithSimilarPath) . " & Most similar:".count($LinksWithSimilarPathStrict); 
		
		if($this->debug)
			echo "\n<br> Correct change XPath index:".$correctNodeIndex;
		
		if(is_numeric($correctNodeIndex) ){
			
			// drop links whose differing step is not the most common one (0 means identical path)
			foreach ($changeIndexArr as $changeKey => $changeValue){
				if($changeValue != 0 && $changeValue != $correctNodeIndex){
					unset($LinksWithSimilarPathStrict[$changeKey]);
					unset($LinksWithSimilarPathStrictTitles[$changeKey]);
				}
			}
			
		}
		
		// re-index so both lists stay gap free and aligned with each other
		return array(array_values($LinksWithSimilarPathStrict) , array_values($LinksWithSimilarPathStrictTitles)) ;
		
	}
	
}