382 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			PHP
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			382 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			PHP
		
	
	
		
			Executable File
		
	
	
	
	
<?php
 | 
						|
/**
 | 
						|
 *  base include file for SimpleTest
 | 
						|
 *  @package    SimpleTest
 | 
						|
 *  @subpackage WebTester
 | 
						|
 *  @version    $Id: php_parser.php 1911 2009-07-29 16:38:04Z lastcraft $
 | 
						|
 */
 | 
						|
 | 
						|
/**
 | 
						|
 *    Builds the page object.
 | 
						|
 *    @package SimpleTest
 | 
						|
 *    @subpackage WebTester
 | 
						|
 */
 | 
						|
class SimpleTidyPageBuilder {
 | 
						|
    private $page;
 | 
						|
    private $forms = array();
 | 
						|
    private $labels = array();
 | 
						|
    private $widgets_by_id = array();
 | 
						|
 | 
						|
    public function __destruct() {
 | 
						|
        $this->free();
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *    Frees up any references so as to allow the PHP garbage
 | 
						|
     *    collection from unset() to work.
 | 
						|
     */
 | 
						|
    private function free() {
 | 
						|
        unset($this->page);
 | 
						|
        $this->forms = array();
 | 
						|
        $this->labels = array();
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *    This builder is only available if the 'tidy' extension is loaded.
 | 
						|
     *    @return boolean       True if available.
 | 
						|
     */
 | 
						|
    function can() {
 | 
						|
        return extension_loaded('tidy');
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *    Reads the raw content the page using HTML Tidy.
 | 
						|
     *    @param $response SimpleHttpResponse  Fetched response.
 | 
						|
     *    @return SimplePage                   Newly parsed page.
 | 
						|
     */
 | 
						|
    function parse($response) {
 | 
						|
        $this->page = new SimplePage($response);
 | 
						|
        $tidied = tidy_parse_string($input = $this->insertGuards($response->getContent()),
 | 
						|
                                    array('output-xml' => false, 'wrap' => '0', 'indent' => 'no'),
 | 
						|
                                    'latin1');
 | 
						|
        $this->walkTree($tidied->html());
 | 
						|
        $this->attachLabels($this->widgets_by_id, $this->labels);
 | 
						|
        $this->page->setForms($this->forms);
 | 
						|
        $page = $this->page;
 | 
						|
        $this->free();
 | 
						|
        return $page;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *    Stops HTMLTidy stripping content that we wish to preserve.
 | 
						|
     *    @param string      The raw html.
 | 
						|
     *    @return string     The html with guard tags inserted.
 | 
						|
     */
 | 
						|
    private function insertGuards($html) {
 | 
						|
        return $this->insertEmptyTagGuards($this->insertTextareaSimpleWhitespaceGuards($html));
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *    Removes the extra content added during the parse stage
 | 
						|
     *    in order to preserve content we don't want stripped
 | 
						|
     *    out by HTMLTidy.
 | 
						|
     *    @param string      The raw html.
 | 
						|
     *    @return string     The html with guard tags removed.
 | 
						|
     */
 | 
						|
    private function stripGuards($html) {
 | 
						|
        return $this->stripTextareaWhitespaceGuards($this->stripEmptyTagGuards($html));
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *    HTML tidy strips out empty tags such as <option> which we
 | 
						|
     *    need to preserve. This method inserts an additional marker.
 | 
						|
     *    @param string      The raw html.
 | 
						|
     *    @return string     The html with guards inserted.
 | 
						|
     */
 | 
						|
    private function insertEmptyTagGuards($html) {
 | 
						|
        return preg_replace('#<(option|textarea)([^>]*)>(\s*)</(option|textarea)>#is',
 | 
						|
                            '<\1\2>___EMPTY___\3</\4>',
 | 
						|
                            $html);
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *    HTML tidy strips out empty tags such as <option> which we
 | 
						|
     *    need to preserve. This method strips additional markers
 | 
						|
     *    inserted by SimpleTest to the tidy output used to make the
 | 
						|
     *    tags non-empty. This ensures their preservation.
 | 
						|
     *    @param string      The raw html.
 | 
						|
     *    @return string     The html with guards removed.
 | 
						|
     */
 | 
						|
    private function stripEmptyTagGuards($html) {
 | 
						|
        return preg_replace('#(^|>)(\s*)___EMPTY___(\s*)(</|$)#i', '\2\3', $html);
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *    By parsing the XML output of tidy, we lose some whitespace
 | 
						|
     *    information in textarea tags. We temporarily recode this
 | 
						|
     *    data ourselves so as not to lose it.
 | 
						|
     *    @param string      The raw html.
 | 
						|
     *    @return string     The html with guards inserted.
 | 
						|
     */
 | 
						|
    private function insertTextareaSimpleWhitespaceGuards($html) {
 | 
						|
        return preg_replace_callback('#<textarea([^>]*)>(.*?)</textarea>#is',
 | 
						|
                                     array($this, 'insertWhitespaceGuards'),
 | 
						|
                                     $html);
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Callback for insertTextareaSimpleWhitespaceGuards().
 | 
						|
     *  @param array $matches       Result of preg_replace_callback().
 | 
						|
     *  @return string              Guard tags now replace whitespace.
 | 
						|
     */
 | 
						|
    private function insertWhitespaceGuards($matches) {
 | 
						|
        return '<textarea' . $matches[1] . '>' .
 | 
						|
                str_replace(array("\n", "\r", "\t", ' '),
 | 
						|
                            array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
 | 
						|
                            $matches[2]) .
 | 
						|
                '</textarea>';
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *    Removes the whitespace preserving guards we added
 | 
						|
     *    before parsing.
 | 
						|
     *    @param string      The raw html.
 | 
						|
     *    @return string     The html with guards removed.
 | 
						|
     */
 | 
						|
    private function stripTextareaWhitespaceGuards($html) {
 | 
						|
        return str_replace(array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
 | 
						|
                           array("\n", "\r", "\t", ' '),
 | 
						|
                           $html);
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Visits the given node and all children
 | 
						|
     *  @param object $node      Tidy XML node.
 | 
						|
     */
 | 
						|
    private function walkTree($node) {
 | 
						|
        if ($node->name == 'a') {
 | 
						|
            $this->page->addLink($this->tags()->createTag($node->name, (array)$node->attribute)
 | 
						|
                                        ->addContent($this->innerHtml($node)));
 | 
						|
        } elseif ($node->name == 'base' and isset($node->attribute['href'])) {
 | 
						|
            $this->page->setBase($node->attribute['href']);
 | 
						|
        } elseif ($node->name == 'title') {
 | 
						|
            $this->page->setTitle($this->tags()->createTag($node->name, (array)$node->attribute)
 | 
						|
                                         ->addContent($this->innerHtml($node)));
 | 
						|
        } elseif ($node->name == 'frameset') {
 | 
						|
            $this->page->setFrames($this->collectFrames($node));
 | 
						|
        } elseif ($node->name == 'form') {
 | 
						|
            $this->forms[] = $this->walkForm($node, $this->createEmptyForm($node));
 | 
						|
        } elseif ($node->name == 'label') {
 | 
						|
            $this->labels[] = $this->tags()->createTag($node->name, (array)$node->attribute)
 | 
						|
                                           ->addContent($this->innerHtml($node));
 | 
						|
        } else {
 | 
						|
            $this->walkChildren($node);
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Helper method for traversing the XML tree.
 | 
						|
     *  @param object $node     Tidy XML node.
 | 
						|
     */
 | 
						|
    private function walkChildren($node) {
 | 
						|
        if ($node->hasChildren()) {
 | 
						|
            foreach ($node->child as $child) {
 | 
						|
                $this->walkTree($child);
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Facade for forms containing preparsed widgets.
 | 
						|
     *  @param object $node     Tidy XML node.
 | 
						|
     *  @return SimpleForm      Facade for SimpleBrowser.
 | 
						|
     */
 | 
						|
    private function createEmptyForm($node) {
 | 
						|
        return new SimpleForm($this->tags()->createTag($node->name, (array)$node->attribute), $this->page);
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Visits the given node and all children
 | 
						|
     *  @param object $node      Tidy XML node.
 | 
						|
     */
 | 
						|
    private function walkForm($node, $form, $enclosing_label = '') {
 | 
						|
        if ($node->name == 'a') {
 | 
						|
            $this->page->addLink($this->tags()->createTag($node->name, (array)$node->attribute)
 | 
						|
                                              ->addContent($this->innerHtml($node)));
 | 
						|
        } elseif (in_array($node->name, array('input', 'button', 'textarea', 'select'))) {
 | 
						|
            $this->addWidgetToForm($node, $form, $enclosing_label);
 | 
						|
        } elseif ($node->name == 'label') {
 | 
						|
            $this->labels[] = $this->tags()->createTag($node->name, (array)$node->attribute)
 | 
						|
                                           ->addContent($this->innerHtml($node));
 | 
						|
            if ($node->hasChildren()) {
 | 
						|
                foreach ($node->child as $child) {
 | 
						|
                    $this->walkForm($child, $form, SimplePage::normalise($this->innerHtml($node)));
 | 
						|
                }
 | 
						|
            }
 | 
						|
        } elseif ($node->hasChildren()) {
 | 
						|
            foreach ($node->child as $child) {
 | 
						|
                $this->walkForm($child, $form);
 | 
						|
            }
 | 
						|
        }
 | 
						|
        return $form;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Tests a node for a "for" atribute. Used for
 | 
						|
     *  attaching labels.
 | 
						|
     *  @param object $node      Tidy XML node.
 | 
						|
     *  @return boolean          True if the "for" attribute exists.
 | 
						|
     */
 | 
						|
    private function hasFor($node) {
 | 
						|
        return isset($node->attribute) and $node->attribute['for'];
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Adds the widget into the form container.
 | 
						|
     *  @param object $node             Tidy XML node of widget.
 | 
						|
     *  @param SimpleForm $form         Form to add it to.
 | 
						|
     *  @param string $enclosing_label  The label of any label
 | 
						|
     *                                  tag we might be in.
 | 
						|
     */
 | 
						|
    private function addWidgetToForm($node, $form, $enclosing_label) {
 | 
						|
        $widget = $this->tags()->createTag($node->name, $this->attributes($node));
 | 
						|
        if (! $widget) {
 | 
						|
            return;
 | 
						|
        }
 | 
						|
        $widget->setLabel($enclosing_label)
 | 
						|
               ->addContent($this->innerHtml($node));
 | 
						|
        if ($node->name == 'select') {
 | 
						|
            $widget->addTags($this->collectSelectOptions($node));
 | 
						|
        }
 | 
						|
        $form->addWidget($widget);
 | 
						|
        $this->indexWidgetById($widget);
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Fills the widget cache to speed up searching.
 | 
						|
     *  @param SimpleTag $widget    Parsed widget to cache.
 | 
						|
     */
 | 
						|
    private function indexWidgetById($widget) {
 | 
						|
        $id = $widget->getAttribute('id');
 | 
						|
        if (! $id) {
 | 
						|
            return;
 | 
						|
        }
 | 
						|
        if (! isset($this->widgets_by_id[$id])) {
 | 
						|
            $this->widgets_by_id[$id] = array();
 | 
						|
        }
 | 
						|
        $this->widgets_by_id[$id][] = $widget;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Parses the options from inside an XML select node.
 | 
						|
     *  @param object $node      Tidy XML node.
 | 
						|
     *  @return array            List of SimpleTag options.
 | 
						|
     */
 | 
						|
    private function collectSelectOptions($node) {
 | 
						|
        $options = array();
 | 
						|
        if ($node->name == 'option') {
 | 
						|
            $options[] = $this->tags()->createTag($node->name, $this->attributes($node))
 | 
						|
                                      ->addContent($this->innerHtml($node));
 | 
						|
        }
 | 
						|
        if ($node->hasChildren()) {
 | 
						|
            foreach ($node->child as $child) {
 | 
						|
                $options = array_merge($options, $this->collectSelectOptions($child));
 | 
						|
            }
 | 
						|
        }
 | 
						|
        return $options;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Convenience method for collecting all the attributes
 | 
						|
     *  of a tag. Not sure why Tidy does not have this.
 | 
						|
     *  @param object $node      Tidy XML node.
 | 
						|
     *  @return array            Hash of attribute strings.
 | 
						|
     */
 | 
						|
    private function attributes($node) {
 | 
						|
        if (! preg_match('|<[^ ]+\s(.*?)/?>|s', $node->value, $first_tag_contents)) {
 | 
						|
            return array();
 | 
						|
        }
 | 
						|
        $attributes = array();
 | 
						|
        preg_match_all('/\S+\s*=\s*\'[^\']*\'|(\S+\s*=\s*"[^"]*")|([^ =]+\s*=\s*[^ "\']+?)|[^ "\']+/', $first_tag_contents[1], $matches);
 | 
						|
        foreach($matches[0] as $unparsed) {
 | 
						|
            $attributes = $this->mergeAttribute($attributes, $unparsed);
 | 
						|
        }
 | 
						|
        return $attributes;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Overlay an attribute into the attributes hash.
 | 
						|
     *  @param array $attributes        Current attribute list.
 | 
						|
     *  @param string $raw              Raw attribute string with
 | 
						|
     *                                  both key and value.
 | 
						|
     *  @return array                   New attribute hash.
 | 
						|
     */
 | 
						|
    private function mergeAttribute($attributes, $raw) {
 | 
						|
        $parts = explode('=', $raw);
 | 
						|
        list($name, $value) = count($parts) == 1 ? array($parts[0], $parts[0]) : $parts;
 | 
						|
        $attributes[trim($name)] = html_entity_decode($this->dequote(trim($value)), ENT_QUOTES);
 | 
						|
        return $attributes;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Remove start and end quotes.
 | 
						|
     *  @param string $quoted    A quoted string.
 | 
						|
     *  @return string           Quotes are gone.
 | 
						|
     */
 | 
						|
    private function dequote($quoted) {
 | 
						|
        if (preg_match('/^(\'([^\']*)\'|"([^"]*)")$/', $quoted, $matches)) {
 | 
						|
            return isset($matches[3]) ? $matches[3] : $matches[2];
 | 
						|
        }
 | 
						|
        return $quoted;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Collects frame information inside a frameset tag.
 | 
						|
     *  @param object $node     Tidy XML node.
 | 
						|
     *  @return array           List of SimpleTag frame descriptions.
 | 
						|
     */
 | 
						|
    private function collectFrames($node) {
 | 
						|
        $frames = array();
 | 
						|
        if ($node->name == 'frame') {
 | 
						|
            $frames = array($this->tags()->createTag($node->name, (array)$node->attribute));
 | 
						|
        } else if ($node->hasChildren()) {
 | 
						|
            $frames = array();
 | 
						|
            foreach ($node->child as $child) {
 | 
						|
                $frames = array_merge($frames, $this->collectFrames($child));
 | 
						|
            }
 | 
						|
        }
 | 
						|
        return $frames;
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Extracts the XML node text.
 | 
						|
     *  @param object $node     Tidy XML node.
 | 
						|
     *  @return string          The text only.
 | 
						|
     */
 | 
						|
    private function innerHtml($node) {
 | 
						|
        $raw = '';
 | 
						|
        if ($node->hasChildren()) {
 | 
						|
            foreach ($node->child as $child) {
 | 
						|
                $raw .= $child->value;
 | 
						|
            }
 | 
						|
        }
 | 
						|
        return $this->stripGuards($raw);
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Factory for parsed content holders.
 | 
						|
     *  @return SimpleTagBuilder    Factory.
 | 
						|
     */
 | 
						|
    private function tags() {
 | 
						|
        return new SimpleTagBuilder();
 | 
						|
    }
 | 
						|
 | 
						|
    /**
 | 
						|
     *  Called at the end of a parse run. Attaches any
 | 
						|
     *  non-wrapping labels to their form elements.
 | 
						|
     *  @param array $widgets_by_id     Cached SimpleTag hash.
 | 
						|
     *  @param array $labels            SimpleTag label elements.
 | 
						|
     */
 | 
						|
    private function attachLabels($widgets_by_id, $labels) {
 | 
						|
        foreach ($labels as $label) {
 | 
						|
            $for = $label->getFor();
 | 
						|
            if ($for and isset($widgets_by_id[$for])) {
 | 
						|
                $text = $label->getText();
 | 
						|
                foreach ($widgets_by_id[$for] as $widget) {
 | 
						|
                    $widget->setLabel($text);
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 | 
						|
?>
 |