Skip to content

Instantly share code, notes, and snippets.

@thomas-mcdonald
Created February 2, 2013 12:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomas-mcdonald/4697086 to your computer and use it in GitHub Desktop.
Save thomas-mcdonald/4697086 to your computer and use it in GitHub Desktop.
MediaWiki internal link handler
/**
 * Process [[internal link]] syntax in a piece of wikitext.
 *
 * The text is split on every "[[" and each resulting piece is matched against
 * one of two patterns: a complete link ("target|text]]trail") or an unclosed
 * "target|text" that may be an image whose caption itself contains links.
 * Pieces that match neither are written back to the output unchanged.
 *
 * Depending on the resolved title, a link is turned into a language link, an
 * image, a category membership, a self link, a media link, or a link holder.
 * Besides rewriting $s, this registers language links, categories and
 * always-known links on $this->mOutput and records seen language prefixes in
 * $this->mLangLinkLanguages.
 *
 * @param string &$s Wikitext to process; replaced in place with the result
 * @return LinkHolderArray Holder collection created for this call, including
 *   holders merged in from recursive calls on image captions
 */
function replaceInternalLinks2( &$s ) {
	# Open the two profiling sections that are closed further down: '-setup'
	# is closed right before the main loop and the method-wide section is
	# closed just before returning. Without these the profiler sees closes
	# for sections that were never opened.
	wfProfileIn( __METHOD__ );
	wfProfileIn( __METHOD__ . '-setup' );

	# The regexes are built once and cached across calls
	static $tc = false, $e1, $e1_img;
	# the % is needed to support urlencoded titles as well
	if ( !$tc ) {
		$tc = Title::legalChars() . '#%';
		# Match a link having the form [[namespace:link|alternate]]trail
		$e1 = "/^([{$tc}]+)(?:\\|(.+?))?]](.*)\$/sD";
		# Match cases where there is no "]]", which might still be images
		$e1_img = "/^([{$tc}]+)\\|(.*)\$/sD";
	}

	$holders = new LinkHolderArray( $this );

	# split the entire text string on occurrences of [[
	$a = StringUtils::explode( '[[', ' ' . $s );
	# get the first element (all text up to first [[), and remove the space we added
	$s = $a->current();
	$a->next();
	$line = $a->current(); # Workaround for broken ArrayIterator::next() that returns "void"
	$s = substr( $s, 1 );

	$useLinkPrefixExtension = $this->getTargetLanguage()->linkPrefixExtension();
	$e2 = null;
	if ( $useLinkPrefixExtension ) {
		# Match the end of a line for a word that's not followed by whitespace,
		# e.g. in the case of 'The Arab al[[Razi]]', 'al' will be matched.
		# Below, capture group 1 is used as the remaining text and group 2 as
		# the link prefix.
		$e2 = wfMessage( 'linkprefix' )->inContentLanguage()->text();
	}

	$nottalk = !$this->mTitle->isTalkPage();

	if ( $useLinkPrefixExtension ) {
		$m = array();
		if ( preg_match( $e2, $s, $m ) ) {
			$first_prefix = $m[2];
		} else {
			$first_prefix = false;
		}
	} else {
		$prefix = '';
	}

	$useSubpages = $this->areSubpagesAllowed();
	wfProfileOut( __METHOD__ . '-setup' );

	# Loop for each link
	for ( ; $line !== false && $line !== null ; $a->next(), $line = $a->current() ) {
		if ( $useLinkPrefixExtension ) {
			wfProfileIn( __METHOD__ . '-prefixhandling' );
			# Move a trailing word off the output so far and into $prefix
			if ( preg_match( $e2, $s, $m ) ) {
				$prefix = $m[2];
				$s = $m[1];
			} else {
				$prefix = '';
			}
			# first link
			if ( $first_prefix ) {
				$prefix = $first_prefix;
				$first_prefix = false;
			}
			wfProfileOut( __METHOD__ . '-prefixhandling' );
		}

		$might_be_img = false;

		wfProfileIn( __METHOD__ . "-e1" );
		if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt
			$text = $m[2];
			# If we get a ] at the beginning of $m[3] that means we have a link that's something like:
			# [[Image:Foo.jpg|[http://example.com desc]]] <- having three ] in a row breaks the match,
			# the real problem is with the $e1 regex
			# See bug 1300.
			#
			# Still some problems for cases where the ] is meant to be outside punctuation,
			# and no image is in sight. See bug 2095.
			#
			if ( $text !== '' && substr( $m[3], 0, 1 ) === ']' && strpos( $text, '[' ) !== false ) {
				$text .= ']'; # so that replaceExternalLinks($text) works later
				$m[3] = substr( $m[3], 1 );
			}
			# fix up urlencoded title texts
			if ( strpos( $m[1], '%' ) !== false ) {
				# Should anchors '#' also be rejected?
				$m[1] = str_replace( array('<', '>'), array('&lt;', '&gt;'), rawurldecode( $m[1] ) );
			}
			$trail = $m[3];
		} elseif ( preg_match( $e1_img, $line, $m ) ) { # Invalid, but might be an image with a link in its caption
			$might_be_img = true;
			$text = $m[2];
			if ( strpos( $m[1], '%' ) !== false ) {
				$m[1] = rawurldecode( $m[1] );
			}
			$trail = "";
		} else { # Invalid form; output directly
			$s .= $prefix . '[[' . $line;
			wfProfileOut( __METHOD__ . "-e1" );
			continue;
		}
		wfProfileOut( __METHOD__ . "-e1" );
		wfProfileIn( __METHOD__ . "-misc" );

		# Don't allow internal links to pages containing
		# PROTO: where PROTO is a valid URL protocol; these
		# should be external links.
		if ( preg_match( '/^(?i:' . $this->mUrlProtocols . ')/', $m[1] ) ) {
			$s .= $prefix . '[[' . $line;
			wfProfileOut( __METHOD__ . "-misc" );
			continue;
		}

		# Make subpage if necessary
		if ( $useSubpages ) {
			$link = $this->maybeDoSubpageLink( $m[1], $text );
		} else {
			$link = $m[1];
		}

		# A leading ':' forces a plain link instead of the special handling
		# (language link, image, category) applied further down
		$noforce = ( substr( $m[1], 0, 1 ) !== ':' );
		if ( !$noforce ) {
			# Strip off leading ':'
			$link = substr( $link, 1 );
		}

		wfProfileOut( __METHOD__ . "-misc" );
		wfProfileIn( __METHOD__ . "-title" );
		$nt = Title::newFromText( $this->mStripState->unstripNoWiki( $link ) );
		if ( $nt === null ) {
			# No title could be built from the target; output the text unchanged
			$s .= $prefix . '[[' . $line;
			wfProfileOut( __METHOD__ . "-title" );
			continue;
		}

		$ns = $nt->getNamespace();
		$iw = $nt->getInterWiki();
		wfProfileOut( __METHOD__ . "-title" );

		if ( $might_be_img ) { # if this is actually an invalid link
			wfProfileIn( __METHOD__ . "-might_be_img" );
			if ( $ns == NS_FILE && $noforce ) { # but might be an image
				$found = false;
				while ( true ) {
					# look at the next 'line' to see if we can close it there
					$a->next();
					$next_line = $a->current();
					if ( $next_line === false || $next_line === null ) {
						break;
					}
					$m = explode( ']]', $next_line, 3 );
					if ( count( $m ) == 3 ) {
						# the first ]] closes the inner link, the second the image
						$found = true;
						$text .= "[[{$m[0]}]]{$m[1]}";
						$trail = $m[2];
						break;
					} elseif ( count( $m ) == 2 ) {
						# if there's exactly one ]] that's fine, we'll keep looking
						$text .= "[[{$m[0]}]]{$m[1]}";
					} else {
						# if $next_line is invalid too, we need look no further
						$text .= '[[' . $next_line;
						break;
					}
				}
				if ( !$found ) {
					# we couldn't find the end of this imageLink, so output it raw
					# but don't ignore what might be perfectly normal links in the text we've examined
					$holders->merge( $this->replaceInternalLinks2( $text ) );
					$s .= "{$prefix}[[$link|$text";
					# note: no $trail, because without an end, there *is* no trail
					wfProfileOut( __METHOD__ . "-might_be_img" );
					continue;
				}
			} else {
				# it's not an image, so output it raw
				$s .= "{$prefix}[[$link|$text";
				# note: no $trail, because without an end, there *is* no trail
				wfProfileOut( __METHOD__ . "-might_be_img" );
				continue;
			}
			wfProfileOut( __METHOD__ . "-might_be_img" );
		}

		$wasblank = ( $text == '' );
		if ( $wasblank ) {
			$text = $link;
		} else {
			# Bug 4598 madness. Handle the quotes only if they come from the alternate part
			# [[Lista d''e paise d''o munno]] -> <a href="...">Lista d''e paise d''o munno</a>
			# [[Criticism of Harry Potter|Criticism of ''Harry Potter'']]
			# -> <a href="Criticism of Harry Potter">Criticism of <i>Harry Potter</i></a>
			$text = $this->doQuotes( $text );
		}

		# Link not escaped by : , create the various objects
		if ( $noforce ) {
			# Interwikis
			wfProfileIn( __METHOD__ . "-interwiki" );
			if ( $iw && $this->mOptions->getInterwikiMagic() && $nottalk && Language::fetchLanguageName( $iw, null, 'mw' ) ) {
				// XXX: the above check prevents links to sites with identifiers that are not language codes

				# Bug 24502: filter duplicates
				if ( !isset( $this->mLangLinkLanguages[$iw] ) ) {
					$this->mLangLinkLanguages[$iw] = true;
					$this->mOutput->addLanguageLink( $nt->getFullText() );
				}

				# The link itself produces no inline output; only the prefix
				# and a trail that is more than newlines are kept
				$s = rtrim( $s . $prefix );
				$s .= trim( $trail, "\n" ) == '' ? '': $prefix . $trail;
				wfProfileOut( __METHOD__ . "-interwiki" );
				continue;
			}
			wfProfileOut( __METHOD__ . "-interwiki" );

			if ( $ns == NS_FILE ) {
				wfProfileIn( __METHOD__ . "-image" );
				if ( !wfIsBadImage( $nt->getDBkey(), $this->mTitle ) ) {
					if ( $wasblank ) {
						# if no parameters were passed, $text
						# becomes something like "File:Foo.png",
						# which we don't want to pass on to the
						# image generator
						$text = '';
					} else {
						# recursively parse links inside the image caption
						# actually, this will parse them in any other parameters, too,
						# but it might be hard to fix that, and it doesn't matter ATM
						$text = $this->replaceExternalLinks( $text );
						$holders->merge( $this->replaceInternalLinks2( $text ) );
					}
					# cloak any absolute URLs inside the image markup, so replaceExternalLinks() won't touch them
					$s .= $prefix . $this->armorLinks(
						$this->makeImage( $nt, $text, $holders ) ) . $trail;
				} else {
					# Image rejected by wfIsBadImage(): emit only the surrounding text
					$s .= $prefix . $trail;
				}
				wfProfileOut( __METHOD__ . "-image" );
				continue;
			}

			if ( $ns == NS_CATEGORY ) {
				wfProfileIn( __METHOD__ . "-category" );
				$s = rtrim( $s . "\n" ); # bug 87

				if ( $wasblank ) {
					$sortkey = $this->getDefaultSort();
				} else {
					$sortkey = $text;
				}
				$sortkey = Sanitizer::decodeCharReferences( $sortkey );
				$sortkey = str_replace( "\n", '', $sortkey );
				$sortkey = $this->getConverterLanguage()->convertCategoryKey( $sortkey );
				$this->mOutput->addCategory( $nt->getDBkey(), $sortkey );

				# Keep prefix and trail only if together they are more than newlines
				$s .= trim( $prefix . $trail, "\n" ) == '' ? '' : $prefix . $trail;

				wfProfileOut( __METHOD__ . "-category" );
				continue;
			}
		}

		# Self-link checking
		if ( $nt->getFragment() === '' && $ns != NS_SPECIAL ) {
			if ( $nt->equals( $this->mTitle ) || ( !$nt->isKnown() && in_array($this->mTitle->getPrefixedText(),$this->getConverterLanguage()->autoConvertToAllVariants( $nt->getPrefixedText() ),true) ) ) {
				$s .= $prefix . Linker::makeSelfLinkObj( $nt, $text, '', $trail );
				continue;
			}
		}

		# NS_MEDIA is a pseudo-namespace for linking directly to a file
		# @todo FIXME: Should do batch file existence checks, see comment below
		if ( $ns == NS_MEDIA ) {
			wfProfileIn( __METHOD__ . "-media" );
			# Give extensions a chance to select the file revision for us
			$options = array();
			$descQuery = false;
			wfRunHooks( 'BeforeParserFetchFileAndTitle', array( $this, $nt, &$options, &$descQuery ) );
			# Fetch and register the file (file title may be different via hooks)
			list( $file, $nt ) = $this->fetchFileAndTitle( $nt, $options );
			# Cloak with NOPARSE to avoid replacement in replaceExternalLinks
			$s .= $prefix . $this->armorLinks(Linker::makeMediaLinkFile( $nt, $file, $text ) ) . $trail;
			wfProfileOut( __METHOD__ . "-media" );
			continue;
		}

		wfProfileIn( __METHOD__ . "-always_known" );
		# Some titles, such as valid special pages or files in foreign repos, should
		# be shown as bluelinks even though they're not included in the page table
		#
		# @todo FIXME: isAlwaysKnown() can be expensive for file links; we should really do
		# batch file existence checks for NS_FILE and NS_MEDIA
		if ( $iw == '' && $nt->isAlwaysKnown() ) {
			$this->mOutput->addLink( $nt );
			$s .= $this->makeKnownLinkHolder( $nt, $text, array(), $trail, $prefix );
		} else {
			# Links will be added to the output link list after checking
			$s .= $holders->makeHolder( $nt, $text, array(), $trail, $prefix );
		}
		wfProfileOut( __METHOD__ . "-always_known" );
	}
	wfProfileOut( __METHOD__ );
	return $holders;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment