Current File : /home/jvzmxxx/wiki/extensions/Wikibase/repo/includes/Dumpers/DumpGenerator.php
<?php

namespace Wikibase\Dumpers;

use InvalidArgumentException;
use LogicException;
use Wikibase\DataModel\Entity\EntityId;
use Wikibase\DataModel\Services\Lookup\EntityLookupException;
use Wikibase\DataModel\Services\Entity\EntityPrefetcher;
use Wikibase\Lib\Reporting\ExceptionHandler;
use Wikibase\Lib\Reporting\MessageReporter;
use Wikibase\Lib\Reporting\NullMessageReporter;
use Wikibase\Lib\Reporting\RethrowingExceptionHandler;
use Wikibase\Lib\Store\StorageException;
use Wikibase\Repo\Store\EntityIdPager;

/**
 * DumpGenerator generates a dump of a given set of entities, excluding
 * redirects.
 *
 * @since 0.5
 *
 * @license GPL-2.0+
 * @author Daniel Kinzler
 */
abstract class DumpGenerator {

	/**
	 * @var int The max number of entities to process in a single batch.
	 *      Also controls the interval for progress reports.
	 */
	private $batchSize = 100;

	/**
	 * @var resource File handle for output
	 */
	protected $out;

	/**
	 * @var int Total number of shards a request should be split into
	 */
	protected $shardingFactor = 1;

	/**
	 * @var int Number of the requested shard
	 */
	protected $shard = 0;

	/**
	 * @var MessageReporter
	 */
	protected $progressReporter;

	/**
	 * @var ExceptionHandler
	 */
	protected $exceptionHandler;

	/**
	 * @var EntityPrefetcher
	 */
	protected $entityPrefetcher;

	/**
	 * @var string|null Entity type to include in the dump, or null for any type.
	 */
	protected $entityType = null;

	/**
	 * Entity count limit - the dump stops once this many entities were written.
	 * 0 (the default) means no limit.
	 *
	 * @var int
	 */
	protected $limit = 0;

	/**
	 * @param resource $out File handle the dump is written to.
	 * @param EntityPrefetcher $entityPrefetcher
	 *
	 * @throws InvalidArgumentException If $out is not a resource.
	 */
	public function __construct( $out, EntityPrefetcher $entityPrefetcher ) {
		if ( !is_resource( $out ) ) {
			throw new InvalidArgumentException( '$out must be a file handle!' );
		}

		$this->out = $out;

		$this->entityPrefetcher = $entityPrefetcher;
		// Defaults: progress messages are discarded, exceptions are re-thrown.
		$this->progressReporter = new NullMessageReporter();
		$this->exceptionHandler = new RethrowingExceptionHandler();
	}

	/**
	 * Set maximum number of entities produced.
	 *
	 * @param int $limit Maximum number of entities to dump. 0 means no limit;
	 *        negative values are treated like 0.
	 */
	public function setLimit( $limit ) {
		// A negative limit would always compare as "already reached" and cut the
		// dump short after the first entity, so clamp it to 0 ("no limit").
		$this->limit = max( 0, (int)$limit );
	}

	/**
	 * Sets the batch size for processing. The batch size is passed to
	 * EntityIdPager::fetchIds() when listing IDs, and also controls the
	 * interval of progress reports.
	 *
	 * @param int $batchSize
	 *
	 * @throws InvalidArgumentException
	 */
	public function setBatchSize( $batchSize ) {
		if ( !is_int( $batchSize ) || $batchSize < 1 ) {
			throw new InvalidArgumentException( '$batchSize must be an integer >= 1' );
		}

		$this->batchSize = $batchSize;
	}

	/**
	 * @see setBatchSize()
	 *
	 * @return int
	 */
	public function getBatchSize() {
		return $this->batchSize;
	}

	/**
	 * @param MessageReporter $progressReporter
	 */
	public function setProgressReporter( MessageReporter $progressReporter ) {
		$this->progressReporter = $progressReporter;
	}

	/**
	 * @return MessageReporter
	 */
	public function getProgressReporter() {
		return $this->progressReporter;
	}

	/**
	 * @param ExceptionHandler $exceptionHandler
	 */
	public function setExceptionHandler( ExceptionHandler $exceptionHandler ) {
		$this->exceptionHandler = $exceptionHandler;
	}

	/**
	 * @return ExceptionHandler
	 */
	public function getExceptionHandler() {
		return $this->exceptionHandler;
	}

	/**
	 * Set the sharding factor and desired shard.
	 * For instance, to generate four dumps in parallel, use setShardingFilter( 4, 0 )
	 * for the first dump, setShardingFilter( 4, 1 ) for the second dump, etc.
	 *
	 * @param int $shardingFactor Total number of shards, must be >= 1.
	 * @param int $shard Zero-based number of the shard to dump, must be
	 *        smaller than $shardingFactor.
	 *
	 * @throws InvalidArgumentException
	 */
	public function setShardingFilter( $shardingFactor, $shard ) {
		if ( !is_int( $shardingFactor ) || $shardingFactor < 1 ) {
			throw new InvalidArgumentException( '$shardingFactor must be a positive integer.' );
		}

		if ( !is_int( $shard ) || $shard < 0 ) {
			throw new InvalidArgumentException( '$shard must be a non-negative integer.' );
		}

		if ( $shard >= $shardingFactor ) {
			throw new InvalidArgumentException( '$shard must be smaller than $shardingFactor.' );
		}

		$this->shardingFactor = $shardingFactor;
		$this->shard = $shard;
	}

	/**
	 * Set the entity type to be included in the output.
	 *
	 * @param string|null $type The desired type (use null for any type).
	 */
	public function setEntityTypeFilter( $type ) {
		$this->entityType = $type;
	}

	/**
	 * Whether the given ID passes both the shard filter and the type filter.
	 *
	 * @param EntityId $entityId
	 *
	 * @return bool
	 */
	private function idMatchesFilters( EntityId $entityId ) {
		return $this->idMatchesShard( $entityId ) && $this->idMatchesType( $entityId );
	}

	/**
	 * Whether the given ID belongs to the shard selected via setShardingFilter().
	 * The shard of an ID is derived from the SHA-1 hash of its serialization.
	 *
	 * @param EntityId $entityId
	 *
	 * @return bool
	 */
	private function idMatchesShard( EntityId $entityId ) {
		// Shortcut: with a single shard, every ID matches.
		if ( $this->shardingFactor === 1 ) {
			return true;
		}

		$hash = sha1( $entityId->getSerialization() );
		$shard = (int)hexdec( substr( $hash, 0, 8 ) ); // 4 bytes of the hash
		$shard = abs( $shard ); // avoid negative numbers on 32 bit systems
		$shard %= $this->shardingFactor; // modulo number of shards

		return $shard === $this->shard;
	}

	/**
	 * Whether the given ID has the entity type selected via setEntityTypeFilter().
	 * Always true if no type filter is set.
	 *
	 * @param EntityId $entityId
	 *
	 * @return bool
	 */
	private function idMatchesType( EntityId $entityId ) {
		return $this->entityType === null || ( $entityId->getEntityType() === $this->entityType );
	}

	/**
	 * Whether the configured entity limit has been reached.
	 * A limit of 0 (or less) means that there is no limit.
	 *
	 * @param int $dumpCount The number of entities dumped so far.
	 *
	 * @return bool
	 */
	private function limitReached( $dumpCount ) {
		return $this->limit > 0 && $dumpCount >= $this->limit;
	}

	/**
	 * Writes the given string to the output provided to the constructor.
	 *
	 * @param string $data
	 */
	protected function writeToDump( $data ) {
		//TODO: use output stream object
		// NOTE: the return value of fwrite() is not checked, so a failed or
		// short write goes unnoticed here.
		fwrite( $this->out, $data );
	}

	/**
	 * Do something before dumping data
	 */
	protected function preDump() {
		// Nothing by default
	}

	/**
	 * Do something after dumping data
	 */
	protected function postDump() {
		// Nothing by default
	}

	/**
	 * Do something before dumping entity
	 *
	 * @param int $dumpCount The number of entities dumped before this one.
	 */
	protected function preEntityDump( $dumpCount ) {
		// Nothing by default
	}

	/**
	 * Do something after dumping entity
	 *
	 * @param int $dumpCount The number of entities dumped before this one
	 *        (the counter is incremented only after this hook was called).
	 */
	protected function postEntityDump( $dumpCount ) {
		// Nothing by default
	}

	/**
	 * Generates a dump, writing to the file handle provided to the constructor.
	 *
	 * @param EntityIdPager $idPager
	 */
	public function generateDump( EntityIdPager $idPager ) {
		$dumpCount = 0;

		$this->preDump();

		// Iterate over batches of IDs. No position is passed to the pager, so it is
		// expected to continue where the previous fetchIds() call left off.
		while ( true ) {
			$ids = $idPager->fetchIds( $this->batchSize );
			if ( !$ids ) {
				// The pager is exhausted.
				break;
			}

			$this->dumpEntities( $ids, $dumpCount );

			$this->progressReporter->reportMessage( 'Processed ' . $dumpCount . ' entities.' );

			if ( $this->limitReached( $dumpCount ) ) {
				break;
			}
		}

		$this->postDump();
	}

	/**
	 * Dump list of entities. IDs that do not match the shard and type filters
	 * are skipped, as are entities for which no dump data is generated.
	 *
	 * @param EntityId[] $entityIds
	 * @param int &$dumpCount The number of entities already dumped (will be updated).
	 */
	private function dumpEntities( array $entityIds, &$dumpCount ) {
		$toLoad = array();
		foreach ( $entityIds as $entityId ) {
			if ( $this->idMatchesFilters( $entityId ) ) {
				$toLoad[] = $entityId;
			}
		}
		$this->entityPrefetcher->prefetch( $toLoad );

		foreach ( $toLoad as $entityId ) {
			try {
				$data = $this->generateDumpForEntityId( $entityId );
				if ( !$data ) {
					// Nothing to write for this entity; it does not count towards the limit.
					continue;
				}

				$this->preEntityDump( $dumpCount );
				$this->writeToDump( $data );
				$this->postEntityDump( $dumpCount );

				$dumpCount++;
				if ( $this->limitReached( $dumpCount ) ) {
					break;
				}
			} catch ( EntityLookupException $ex ) {
				$this->reportDumpFailure( $ex, $entityId );
			} catch ( StorageException $ex ) {
				$this->reportDumpFailure( $ex, $entityId );
			} catch ( LogicException $ex ) {
				$this->reportDumpFailure( $ex, $entityId );
			}
		}
	}

	/**
	 * Passes an exception that occurred while dumping an entity on to the
	 * configured exception handler. Whether dumping continues afterwards
	 * depends on whether the handler re-throws the exception.
	 *
	 * @param \Exception $ex
	 * @param EntityId $entityId The ID of the entity that failed to dump.
	 */
	private function reportDumpFailure( $ex, EntityId $entityId ) {
		$this->exceptionHandler->handleException( $ex, 'failed-to-dump', 'Failed to dump ' . $entityId );
	}

	/**
	 * Produce dump data for specific entity
	 *
	 * @param EntityId $entityId
	 *
	 * @throws EntityLookupException
	 * @throws StorageException
	 * @return string|null The data to write, or null (or any other falsy
	 *         value) if the entity should be skipped.
	 */
	abstract protected function generateDumpForEntityId( EntityId $entityId );

}