Current File : /home/jvzmxxx/wiki1/extensions/Flow/includes/Search/Searcher.php
<?php

namespace Flow\Search;

use Elastica\Query;
use Elastica\Query\QueryString;
use Elastica\Exception\ExceptionInterface;
use Elastica\Request;
use Flow\Container;
use PoolCounterWorkViaCallback;
use Status;

/**
 * Runs full-text searches against the Flow Elasticsearch index and asks
 * Elasticsearch to highlight matches and count total term hits.
 */
class Searcher {
	const HIGHLIGHT_FIELD = 'revisions.text';
	const HIGHLIGHT_PRE = '<span class="searchmatch">';
	const HIGHLIGHT_POST = '</span>';

	/**
	 * @var string|false $type
	 */
	protected $type = false;

	/**
	 * @var string
	 */
	protected $indexBaseName;

	/**
	 * @var Query
	 */
	protected $query;

	/**
	 * @var Connection
	 */
	protected $connection;

	/**
	 * @param Query $query
	 * @param string|bool $index Base name for index to search from, defaults to wfWikiID()
	 * @param string|bool $type Type of revisions to retrieve, defaults to all
	 */
	public function __construct( Query $query, $index = false, $type = false ) {
		$this->query = $query;
		$this->indexBaseName = $index ?: wfWikiID();
		$this->type = $type;
		$this->connection = Container::get( 'search.connection' );
	}

	/**
	 * Search revisions with provided term.
	 *
	 * Configures the injected Query with a query-string search on
	 * revisions.text, a term-frequency aggregation and highlighting, then
	 * executes it through a PoolCounter.
	 *
	 * @param string $term Term to search
	 * @return Status Good status wrapping the search result, or a fatal
	 *  status when the search (or acquiring the pool) failed
	 */
	public function searchText( $term ) {
		// full-text search
		$queryString = new QueryString( $term );
		$queryString->setFields( array( 'revisions.text' ) );
		$this->query->setQuery( $queryString );

		// add aggregation to determine exact amount of matching search terms
		$terms = $this->getTerms( $term );
		$this->query->addAggregation( $this->termsAggregation( $terms ) );

		// @todo: abstract-away this config? (core/cirrus also has this - share it somehow?)
		$this->query->setHighlight( array(
			'fields' => array(
				static::HIGHLIGHT_FIELD => array(
					'type' => 'plain',
					'order' => 'score',

					// we want just 1 excerpt of result text, which includes all highlights
					'number_of_fragments' => 1,
					'fragment_size' => 10000, // We want the whole value but more than this is crazy
				),
			),
			'pre_tags' => array( static::HIGHLIGHT_PRE ),
			'post_tags' => array( static::HIGHLIGHT_POST ),
		) );

		// @todo: support insource: queries (and perhaps others)

		$searchable = $this->connection->getFlowIndex( $this->indexBaseName );
		if ( $this->type !== false ) {
			$searchable = $searchable->getType( $this->type );
		}
		$search = $searchable->createSearch( $this->query );

		// @todo: PoolCounter config at PoolCounterSettings-eqiad.php
		// @todo: do we want this class to extend from ElasticsearchIntermediary and use its success & failure methods (like CirrusSearch/Searcher does)?

		// Perform the search
		$work = new PoolCounterWorkViaCallback( 'Flow-Search', "_elasticsearch", array(
			'doWork' => function() use ( $search ) {
				try {
					$result = $search->search();
					return Status::newGood( $result );
				} catch ( ExceptionInterface $e ) {
					// strpos() returns 0 for a match at the very start of the
					// message, so compare against false explicitly
					if ( strpos( $e->getMessage(), 'dynamic scripting for [groovy] disabled' ) !== false ) {
						// known issue with default ES config, let's display a more helpful message
						return Status::newFatal( new \RawMessage(
							"Couldn't complete search: dynamic scripting needs to be enabled. " .
							"Please add 'script.disable_dynamic: false' to your elasticsearch.yml"
						) );
					}

					return Status::newFatal( 'flow-error-search' );
				}
			},
			'error' => function( Status $status ) {
				$errors = $status->getErrorsArray();
				// don't trip over a status that carries no error entries
				$reason = isset( $errors[0][0] ) ? $errors[0][0] : 'unknown error';
				wfLogWarning( 'Pool error searching Elasticsearch: ' . $reason );
				return Status::newFatal( 'flow-error-search' );
			}
		) );

		return $work->execute();
	}

	/**
	 * We want to retrieve the total amount of search word hits
	 * (static::termsAggregation) but our search terms may not be how
	 * ElasticSearch stores the words in its index.
	 * Elastic will "analyze" text (perform stemming, etc) and store
	 * the terms in a normalized way.
	 * AFAICT, there is not really a way to get to that information
	 * from within a search query.
	 *
	 * Luckily, since 1.0, Elastic supports _termvector, which gives
	 * you statistics about the terms in your document.
	 * Since 1.4, Elastic supports feeding _termvector documents to
	 * analyze.
	 * We're going to (ab)use this by letting it respond with term
	 * information on a bogus document that contains only our current
	 * search terms.
	 * So we'll give it a document with just our keywords for the
	 * column that we're searching in (revisions.text) and Elastic will
	 * use that column's configuration to analyze the text we feed it.
	 * It will then respond with the normalized terms & their stats.
	 *
	 * @param string $terms Whitespace-separated search keywords
	 * @return array Normalized terms; empty when there are no keywords or
	 *  the response holds no term vector for revisions.text
	 */
	protected function getTerms( $terms ) {
		// drop the empty tokens that leading/trailing whitespace would produce
		$terms = preg_split( '/\s+/', $terms, -1, PREG_SPLIT_NO_EMPTY );
		if ( !$terms ) {
			// nothing to analyze, no need to bother Elasticsearch
			return array();
		}

		// _termvectors only works on a type, but our types are
		// configured exactly the same so it doesn't matter which
		$types = Connection::getAllTypes();
		$searchable = $this->connection->getFlowIndex( $this->indexBaseName );
		$searchable = $searchable->getType( array_pop( $types ) );

		$query = array(
			// bogus document that contains the current search term
			'doc' => array(
				'revisions' => array(
					'text' => $terms,
				),
			),
			"fields" => array( "revisions.text" ),
		);

		// Elastica has no abstraction over _termvector like it has
		// for _query, so just do the request ourselves
		$response = $searchable->request(
			'_termvector',
			Request::POST,
			$query,
			array()
		);

		$data = $response->getData();
		if ( !isset( $data['term_vectors']['revisions.text']['terms'] ) ) {
			// no term vector came back for the field (e.g. every keyword
			// was dropped during analysis)
			return array();
		}

		return array_keys( $data['term_vectors']['revisions.text']['terms'] );
	}

	/**
	 * Builds a sum aggregation whose groovy script adds up, per document,
	 * the term frequency of every given term in revisions.text.
	 *
	 * We can only do this if dynamic scripting is enabled. In elasticsearch.yml:
	 * script.disable_dynamic: false
	 * @see vendor/ruffin/elastica/test/bin/run_elasticsearch.sh
	 *
	 * @param array $terms Terms to count, as returned by getTerms()
	 * @return \Elastica\Aggregation\Sum
	 */
	protected function termsAggregation( array $terms ) {
		// Terms end up inside double-quoted groovy string literals, so every
		// character that is special in there has to be escaped: the quote
		// itself, the escape character, "$" (GString interpolation) and line
		// breaks. strtr() replaces in a single pass, so nothing is escaped twice.
		$escapes = array(
			'\\' => '\\\\',
			'"' => '\\"',
			'$' => '\\$',
			"\n" => '\\n',
			"\r" => '\\r',
		);

		$keywords = array();
		foreach ( $terms as $term ) {
			// array_keys() in getTerms() yields integers for numeric terms
			$keywords[] = '"' . strtr( (string)$term, $escapes ) . '"';
		}

		$script = '
keywords = [' . implode( ',', $keywords ) . ']
total = 0
for (term in keywords) {
	total += _index["revisions.text"][term].tf()
}
return total';
		$script = new \Elastica\Script( $script, null, 'groovy' );

		$aggregation = new \Elastica\Aggregation\Sum( 'ttf' );
		// $aggregation->setScript() doesn't seem to properly set 'lang': 'groovy'
		// see https://github.com/ruflin/Elastica/pull/748
		// $aggregation->setScript( $script );
		$aggregation->setParams( array( 'lang' => 'groovy' ) );
		$aggregation->setParam( 'script', $script->getScript() );

		return $aggregation;
	}
}