| Current File : /home/jvzmxxx/wiki1/extensions/Flow/maintenance/repair_missing_revision_content.php |
<?php
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
$IP = dirname( __FILE__ ) . '/../../..';
}
require_once "$IP/maintenance/commandLine.inc";
require_once "$IP/extensions/Flow/FlowActions.php";
// Change types that are skipped by the main loop: a revision whose
// (resolved) change type is in this list is reported as inheriting its
// parent's content and no ExternalStore search is done for it.
$moderationChangeTypes = array(
	'hide-post',
	'hide-topic',
	'delete-post',
	'delete-topic',
	'suppress-post',
	'suppress-topic',
	'lock-topic',
	'restore-post',
	'restore-topic',
);

// Change types whose content is flagged as wikitext in the results file.
// Every other change type is flagged as html.
$plaintextChangeTypes = array(
	'edit-title',
	'new-topic',
);

// Results are written as CSV into the current working directory, one file
// per wiki. An existing file of the same name is overwritten ('w' mode).
$csvOutput = fopen( 'repair_results_' . wfWikiID() . '.csv', 'w' );
if ( !$csvOutput ) {
	die( "Could not open results file\n" );
}
fputcsv( $csvOutput, array( "uuid", "esurl", "flags" ) );

// Iterate flow_revision in batches of 10 rows keyed by rev_id, limited to
// rows whose rev_user_wiki is the current wiki.
$it = new BatchRowIterator(
	Flow\Container::get( 'db.factory' )->getDB( DB_SLAVE ),
	'flow_revision',
	array( 'rev_id' ),
	10
);
$it->addConditions( array( 'rev_user_wiki' => wfWikiID() ) );
$it->setFetchColumns( array( 'rev_content', 'rev_content_length', 'rev_change_type', 'rev_parent_id' ) );

// Connection used for the core revision/text lookups done by query_revisions()
$dbr = wfGetDB( DB_SLAVE );

// Counters reported in the summary at the end of the run
$totalMissingConsidered = 0;
$totalCompleteMatch = 0;
$totalMultipleMatches = 0;
$totalResolvedMultipleMatches = 0;
$totalNoMatch = 0;
$totalNoChangeRevisions = 0;
$totalMatchButInvalid = 0;

// Revisions with more than one candidate, keyed by alphadecimal UUID. Must
// be initialised up front: it is read after the main loop even when no
// revision ever produced multiple candidates.
$multipleMatches = array();
// Main scan. For every flow revision whose content cannot be fetched from
// ExternalStore, look at the core revisions created around the same time,
// work out which ExternalStore ids lie in the gaps between the ids those core
// revisions used, and test each gap id as a candidate for the lost content.
// A candidate matches when its length equals rev_content_length.
foreach ( $it as $batch ) {
	foreach ( $batch as $rev ) {
		$item = ExternalStore::fetchFromURL( $rev->rev_content );
		if ( $item ) {
			// contains valid data
			continue;
		}
		++$totalMissingConsidered;
		$uuid = Flow\Model\UUID::create( $rev->rev_id );
		echo "\n********************\n\nProcessing revision " . $uuid->getAlphadecimal() . "\n";
		$tsEscaped = $dbr->addQuotes( $uuid->getTimestamp( TS_MW ) );

		// A string value in $wgFlowActions is followed as a reference to
		// another key. The isset() guard keeps an unknown change type from
		// raising an undefined-index notice.
		$changeType = $rev->rev_change_type;
		while ( isset( $wgFlowActions[$changeType] ) && is_string( $wgFlowActions[$changeType] ) ) {
			$changeType = $wgFlowActions[$changeType];
		}
		if ( in_array( $changeType, $moderationChangeTypes ) ) {
			$totalNoChangeRevisions++;
			echo "Revision inherits parent content, not searching\n";
			continue;
		}

		// Collect 10 core revisions each before and after our revision
		$before = query_revisions( $dbr, '<=', $tsEscaped );
		$after = query_revisions( $dbr, '>', $tsEscaped );
		// $before arrives newest-first (query_revisions orders '<=' lookups
		// descending), so reverse it to get one chronological window.
		$window = array_merge( array_reverse( $before ), $after );
		if ( $window ) {
			$first = reset( $window );
			$last = end( $window );
			echo "Considering core revisions from " . $first->rev_timestamp . " to " . $last->rev_timestamp . "\n";
		} else {
			// Either query may legitimately come back empty; do not
			// dereference a missing row.
			echo "No core revisions found around this revision\n";
		}

		$esIdsForCluster = array();
		foreach ( $window as $row ) {
			$parts = explode( '/', $row->old_text );
			if ( isset( $parts[4] ) ) {
				// Part of a multi-revision blob. This was not created
				// at rev_timestamp
				continue;
			}
			if ( !isset( $parts[3] ) ) {
				// Not of the form DB://cluster/id, so there is no
				// cluster/id pair to read from it
				continue;
			}
			$cluster = $parts[2];
			$id = (int)$parts[3];
			$esIdsForCluster[$cluster][] = $id;
		}

		// find any gaps in ES within this area
		$matches = $lengths = array();
		$invalid = false;
		$flags = 'utf-8,gzip,external';
		if ( in_array( $changeType, $plaintextChangeTypes ) ) {
			$flags .= ',wikitext';
		} else {
			$flags .= ',html';
		}
		echo "Expected length: " . $rev->rev_content_length . "\n";
		foreach ( array_keys( $esIdsForCluster ) as $cluster ) {
			sort( $esIdsForCluster[$cluster] );
			$lastId = reset( $esIdsForCluster[$cluster] );
			foreach ( $esIdsForCluster[$cluster] as $id ) {
				if ( $id === $lastId || $id === $lastId + 1 ) {
					// duplicate or directly consecutive id: no gap
					$lastId = $id;
					continue;
				}
				// every id strictly between two known core ids is a candidate
				$range = range( $lastId + 1, $id - 1 );
				$lastId = $id;
				echo "Checking " . count( $range ) . " es urls\n";
				if ( count( $range ) > 100 ) {
					echo "More than 100 potential es urls, skipping\n";
					// a skipped gap means a later single match cannot be trusted
					$invalid = true;
					continue;
				}
				foreach ( $range as $possible ) {
					$url = "DB://$cluster/$possible";
					$blob = ExternalStore::fetchFromURL( $url );
					if ( !is_string( $blob ) || $blob === '' ) {
						// nothing usable stored under this id
						continue;
					}
					$content = gzinflate( $blob );
					if ( $content === false ) {
						// The flags we record always include gzip, so a blob
						// that does not inflate cannot be our content. Without
						// this check the false value would count as a
						// zero-length candidate.
						continue;
					}
					if ( false !== @unserialize( $content ) ) {
						// if it unserializes, its not our content
						continue;
					}
					$json = @json_decode( $content, true );
					if ( $json && count( $json ) === 1 && isset( $json['flow-workflow'] ) ) {
						// while technically possible to be a topic title, i'm almost
						// certain this is a core revisions inserted by flow in the form
						// of: {"flow-workflow":"sbk26yv6cpcxxm87"}
						continue;
					}
					if ( !in_array( $changeType, $plaintextChangeTypes ) ) {
						if ( false === strpos( $content, 'data-parsoid' ) ) {
							continue;
						}
						// rev_content_length is compared against the wikitext form
						$content = parsoid_to_wikitext( $content );
					}
					$len = mb_strlen( $content );
					if ( $rev->rev_content_length == $len ) {
						// only record each distinct piece of content once
						$doAppend = true;
						foreach ( $matches as $match ) {
							if ( $match[1] === $content ) {
								$doAppend = false;
								break;
							}
						}
						if ( $doAppend ) {
							$matches[] = array( $url, $content, $flags );
						}
					} else {
						// kept only for the diagnostic dump on NO MATCH
						$lengths[] = $len;
					}
				}
			}
		}

		if ( $invalid && count( $matches ) === 1 ) {
			echo "MATCHED BUT INVALID\n";
			var_dump( $matches );
			++$totalMatchButInvalid;
		} elseif ( $invalid || !$matches ) {
			echo "NO MATCH\n";
			var_dump( $matches );
			var_dump( $lengths );
			++$totalNoMatch;
		} elseif ( count( $matches ) === 1 ) {
			list( $url, $content, $flags ) = reset( $matches );
			echo "SINGLE DIRECT MATCH: $url : " . truncate( $content, 1024 ) . "\n";
			++$totalCompleteMatch;
			fputcsv( $csvOutput, array( $uuid->getAlphadecimal(), $url, $flags ) );
		} else {
			echo "MULTIPLE POTENTIAL MATCHES:\n";
			++$totalMultipleMatches;
			// resolved (if possible) after the scan has finished
			$multipleMatches[$uuid->getAlphadecimal()] = $matches;
			foreach ( $matches as $match ) {
				list( $url, $content, $flags ) = $match;
				echo "\t$url : " . truncate( $content, 1024 ) . "\n";
			}
		}
	}
}
// Second pass: try to resolve revisions that had several candidates by
// grouping revisions that share candidates and pairing them off in order.
// !empty() is used because $multipleMatches is only ever assigned when a
// revision actually produced several candidates.
if ( !empty( $multipleMatches ) ) {
	echo "\n********************\n\nAttempting to resolve multiple match sets\n";
	while ( $multipleMatches ) {
		echo "\n********************\n\n";
		// Grab the first key/value pair from $multipleMatches as our
		// first matching group
		$current = reset( $multipleMatches );
		$firstUuid = key( $multipleMatches );
		$group = array(
			$firstUuid => $current,
		);
		// Remove by key: array_shift() renumbers integer-like keys, which
		// would silently replace an all-digit UUID key with 0, 1, ...
		unset( $multipleMatches[$firstUuid] );

		// Look for other revisions in $multipleMatches that matched at least
		// one of the same pieces of ExternalStore data.
		do {
			$repeat = false;
			foreach ( $multipleMatches as $uuid => $matches ) {
				foreach ( $matches as $subMatch ) {
					if ( in_array( $subMatch, $current, true ) ) {
						$group[$uuid] = $matches;
						// expand $current to contain all individual
						// external store data represented in $group
						$current = array_merge( $current, $matches );
						unset( $multipleMatches[$uuid] );
						// because $current has expanded we need to go
						// back to the beginning of $multipleMatches
						$repeat = true;
						break 2;
					}
				}
			}
		} while ( $repeat );

		// Look through all the data in $group. Consider it a match if every
		// revision matched the exact same set of external store urls.
		// Basically what we are looking for is, for example, a set of 3 revisions
		// that all matched the same 3 pieces of ExternalStore data.
		$valid = true;
		$expectedMatches = reset( $group );
		foreach ( $group as $uuid => $matches ) {
			if ( count( $matches ) !== count( $group ) ) {
				echo "Number of matches does not line up: " . count( $matches ) . " !== " . count( $group ) . "\n";
				$valid = false;
				break;
			}
			if ( $matches != $expectedMatches ) {
				echo "Matched subsets do not line up: " . json_encode( $matches ) . " != " . json_encode( $expectedMatches ) . "\n";
				$valid = false;
				break;
			}
		}

		if ( $valid ) {
			// We have multiple revisions that all matched the exact same external
			// store data. Make the assumption that those revisions and the es id's
			// were created in a strictly ordered fashion, such that the first revision
			// lines up with the first ES id, the second with the second, etc.
			echo "declare victory!\n";
			foreach ( array_keys( $group ) as $uuid ) {
				$match = array_shift( $expectedMatches );
				list( $url, $content, $flags ) = $match;
				fputcsv( $csvOutput, array( $uuid, $url, $flags ) );
				// move this revision from "multiple" to "resolved" in the summary
				--$totalMultipleMatches;
				++$totalResolvedMultipleMatches;
			}
		} else {
			var_dump( $group );
		}
	}
	echo "\n********************\n";
}
// All result rows have been written by now; release the results file.
fclose( $csvOutput );

// Formats $count as a whole-number percentage of the missing revisions that
// were considered. When nothing was missing the share is reported as 0
// instead of dividing by zero.
$percentOfMissing = function ( $count ) use ( $totalMissingConsidered ) {
	if ( !$totalMissingConsidered ) {
		return number_format( 0 );
	}
	return number_format( 100 * $count / $totalMissingConsidered );
};

echo "\n\n\nLooked at $totalMissingConsidered flow revisions\n";
echo "Found matches for $totalCompleteMatch (" . $percentOfMissing( $totalCompleteMatch ) . "%)\n";
echo "Found multiple matches for $totalMultipleMatches (" . $percentOfMissing( $totalMultipleMatches ) . "%)\n";
echo "Found no match for $totalNoMatch (" . $percentOfMissing( $totalNoMatch ) . "%)\n";
echo "Found $totalNoChangeRevisions that will inherit parent content (" . $percentOfMissing( $totalNoChangeRevisions ) . "%)\n";
echo "Found a match but invalid due to size of es gaps for $totalMatchButInvalid (" . $percentOfMissing( $totalMatchButInvalid ) . "%)\n";
echo "Resolved $totalResolvedMultipleMatches multiple matches (" . $percentOfMissing( $totalResolvedMultipleMatches ) . "%)\n";
/**
 * Fetch up to 10 core revisions on one side of a timestamp, together with
 * the text-table pointer (old_text) of each.
 *
 * Only revisions whose rev_text_id differs from their parent's are returned.
 * NOTE(review): because that comparison is made against the LEFT JOINed
 * parent row, revisions without a parent row look like they are filtered
 * out as well - confirm this is intended.
 *
 * @param object $dbr Database handle; only its query() method is used
 * @param string $op SQL comparison operator applied to rev_timestamp. An
 *  operator starting with '>' returns rows oldest-first, anything else
 *  returns them newest-first.
 * @param string $tsEscaped Timestamp, already quoted for use in SQL
 * @return array Result rows as returned by iterating the query result
 */
function query_revisions( $dbr, $op, $tsEscaped ) {
	if ( $op[0] === '>' ) {
		$order = 'ASC';
	} else {
		$order = 'DESC';
	}

	// One clause per entry; joined with newlines into a single statement
	$clauses = array(
		'SELECT revision.rev_timestamp, text.old_text',
		'FROM revision',
		'JOIN text ON revision.rev_text_id = old_id',
		'LEFT JOIN revision parent ON parent.rev_id = revision.rev_parent_id',
		"WHERE revision.rev_timestamp $op $tsEscaped",
		'AND revision.rev_text_id <> parent.rev_text_id',
		"ORDER BY revision.rev_timestamp $order",
		'LIMIT 10',
	);

	$rows = $dbr->query( implode( "\n", $clauses ), __METHOD__ );

	return iterator_to_array( $rows );
}
/**
 * Convert html content to wikitext, remembering the result per distinct
 * input so that the same content is only converted once per run.
 *
 * If the conversion throws Flow\Exception\NoParserException, a message is
 * printed and the unconverted input is remembered and returned instead.
 *
 * @param string $content Html to convert
 * @param int $retry Not used by this function
 * @return string Converted wikitext, or $content when no parser was available
 */
function parsoid_to_wikitext( $content, $retry = 3 ) {
	// keyed by md5 of the input; lives for the duration of the script
	static $memo = array();

	$key = md5( $content );
	if ( !isset( $memo[$key] ) ) {
		try {
			$memo[$key] = Flow\Conversion\Utils::convert( 'html', 'wt', $content, Title::newMainPage() );
		} catch ( Flow\Exception\NoParserException $e ) {
			echo "failed to convert to wikitext: " . truncate( $content, 1024 ) . "\n";
			$memo[$key] = $content;
		}
	}

	return $memo[$key];
}
/**
 * Shorten a string for display, appending '...' when something was cut.
 *
 * The limit is measured in bytes, but the cut never lands inside a
 * multi-byte UTF-8 character: mb_strcut() backs off to the previous
 * character boundary, so the result stays valid UTF-8 and may be slightly
 * shorter than $length bytes.
 *
 * @param string $string Text to shorten
 * @param int $length Maximum number of bytes to keep
 * @return string $string unchanged when it fits, otherwise the shortened
 *  text followed by '...'
 */
function truncate( $string, $length ) {
	if ( strlen( $string ) <= $length ) {
		return $string;
	}

	return mb_strcut( $string, 0, $length, 'UTF-8' ) . '...';
}