Skip to content

Instantly share code, notes, and snippets.

@ninnypants
Created March 13, 2012 23:35
Show Gist options
  • Save ninnypants/2032680 to your computer and use it in GitHub Desktop.
Save ninnypants/2032680 to your computer and use it in GitHub Desktop.
Proxy that adjusts links
<?php
// Start (or resume) the PHP session: $_SESSION['cookies'] is used further
// down to carry the remote site's cookies from one proxied request to the next.
session_start();
/**
 * Assemble a URL string from parse_url()-style components.
 *
 * Recognised keys: scheme (default "http"), host, path, query, user, pass,
 * port and fragment. Missing keys are treated as empty.
 *
 * - port may be given with or without its leading ":" (parse_url() returns
 *   a bare integer); the separator is added exactly once.
 * - query may be a string (with or without a leading "?") or an array,
 *   which is encoded with http_build_query().
 * - fragment may be given with or without its leading "#".
 *
 * @param array $args URL components.
 * @return string The assembled URL.
 */
function build_url($args = array()){
    $defaults = array(
        'scheme' => 'http',
        'host' => '',
        'path' => '',
        'query' => '',
        'user' => '',
        'pass' => '',
        'port' => '',
        'fragment' => ''
    );
    $args = array_merge($defaults, $args);

    $url = $args['scheme'].'://';

    // optional "user[:pass]@" credentials
    if(!empty($args['user'])){
        $url .= $args['user'];
        if(!empty($args['pass'])){
            $url .= ':'.$args['pass'];
        }
        $url .= '@';
    }

    $url .= $args['host'];

    // parse_url() yields the port without a colon, so add the separator here
    $port = ltrim((string) $args['port'], ':');
    if($port !== ''){
        $url .= ':'.$port;
    }

    $url .= $args['path'];

    // an array of variables is encoded rather than being cast to "Array"
    if(is_array($args['query'])){
        $query = http_build_query($args['query'], '', '&');
    }else{
        $query = ltrim((string) $args['query'], '?');
    }
    if($query !== ''){
        $url .= '?'.$query;
    }

    // parse_url() yields the fragment without its "#"
    $fragment = ltrim((string) $args['fragment'], '#');
    if($fragment !== ''){
        $url .= '#'.$fragment;
    }

    return $url;
}
/**
 * Turn one (already trimmed, non-empty) asset URL into an absolute URL.
 *
 * @param string $url     URL as written in the page.
 * @param string $cur_dir Absolute URL of the current directory.
 * @param string $base    Scheme and host of the current site.
 * @param string $cur     Absolute URL of the current page, no query string.
 * @return string Absolute URL, or $url untouched when it needs no rewriting.
 */
function _asset_absolute_url($url, $cur_dir, $base, $cur){
    // fragments and anything carrying its own scheme (http:, https:, data:,
    // javascript:, ...) are already complete
    if($url[0] === '#' || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url)){
        return $url;
    }
    // protocol-relative: borrow the scheme of the current site
    if(strpos($url, '//') === 0){
        $scheme = parse_url((string) $base, PHP_URL_SCHEME);
        return ($scheme ? $scheme : 'http').':'.$url;
    }
    // relative to the site root
    if($url[0] === '/'){
        return $base.$url;
    }
    // query string only: relative to the current page
    if($url[0] === '?'){
        return $cur.$url;
    }
    // everything else is relative to the current directory; "../" segments
    // are kept so the client/server can resolve them
    if(strpos($url, './') === 0){
        $url = substr($url, 2);
    }
    return rtrim((string) $cur_dir, '/').'/'.$url;
}

/**
 * Rewrite the relative asset URLs matched by $match so they point at the
 * remote site instead of at the proxy.
 *
 * Reads the globals $cur_dir, $base and $cur. Only the URL itself is replaced,
 * at the exact position where it was matched, so identical text elsewhere in
 * the tag is left alone.
 *
 * @param string $match   PCRE pattern whose capture group 2 is the URL.
 * @param string $content Markup to process.
 * @return string The markup with asset URLs made absolute.
 */
function single_asset_replace($match, $content){
    global $cur_dir, $base, $cur;

    $content = (string) $content;

    // find all instances of the asset, remembering where each URL sits
    if(!preg_match_all($match, $content, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)){
        return $content;
    }

    $out = '';
    $pos = 0;
    foreach($matches as $m){
        // group 2 missing or not part of this match
        if(!isset($m[2]) || $m[2][1] < 0){
            continue;
        }
        list($raw, $offset) = $m[2];
        $url = trim($raw);
        if($url === ''){
            continue;
        }
        // copy everything up to the URL, then the rewritten URL
        $out .= substr($content, $pos, $offset - $pos);
        $out .= _asset_absolute_url($url, $cur_dir, $base, $cur);
        $pos = $offset + strlen($raw);
    }

    return $out.substr($content, $pos);
}
/**
 * Rewrite the href of every <a>/<area> tag so the link is followed through
 * the proxy ("?url=<encoded target>").
 *
 * Reads the globals $cur_dir, $base and $cur to resolve relative links.
 * Links that cannot be proxied are left untouched: empty or whitespace-only
 * hrefs, in-page fragments ("#top") and non-HTTP schemes such as mailto: or
 * javascript:.
 *
 * The leading "http" is removed from the encoded target (so it starts with
 * "://" or "s://"); the proxy puts it back when it reads the url parameter.
 *
 * @param string $content Markup to process.
 * @return string The markup with proxied links.
 */
function link_href_replace($content){
    global $cur_dir, $base, $cur;

    $content = (string) $content;

    // Only real anchor/area tags, and never past the end of the tag: the
    // lookahead rejects <abbr>, <article>, ... and [^>]* stops at ">".
    $pattern = '#<a(?:rea)?(?=\s)[^>]*\shref\s*=\s*("|\')([^"\']+|)("|\')#Ui';
    if(!preg_match_all($pattern, $content, $href_matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)){
        return $content;
    }

    $out = '';
    $pos = 0;
    foreach($href_matches as $m){
        list($raw, $offset) = $m[2];
        $nh = trim($raw);

        // nothing to proxy: empty href or a jump within the current page
        if($nh === '' || $nh[0] === '#'){
            continue;
        }

        if(preg_match('#^https?://#i', $nh)){
            // already absolute
        }elseif(preg_match('#^[a-z][a-z0-9+.\-]*:#i', $nh)){
            // mailto:, javascript:, tel:, ... cannot be fetched by the proxy
            continue;
        }elseif(strpos($nh, '//') === 0){
            // protocol-relative: borrow the scheme of the current site
            $scheme = parse_url((string) $base, PHP_URL_SCHEME);
            $nh = ($scheme ? $scheme : 'http').':'.$nh;
        }elseif($nh[0] === '/'){
            $nh = $base.$nh;
        }elseif($nh[0] === '?'){
            $nh = $cur.$nh;
        }else{
            // relative to the current directory ("../" is left for the
            // remote server to resolve)
            if(strpos($nh, './') === 0){
                $nh = substr($nh, 2);
            }
            $nh = rtrim((string) $cur_dir, '/').'/'.$nh;
        }

        // replace http to work with tagperfect servers
        $nh = preg_replace('#^http#i', '', $nh);

        // splice the proxied URL in at the exact position of the old one
        $out .= substr($content, $pos, $offset - $pos).'?url='.urlencode($nh);
        $pos = $offset + strlen($raw);
    }

    // return updated content
    return $out.substr($content, $pos);
}
/**
 * Collect the cookies stored in the session that may be sent to the host of
 * the page being fetched (global $urlp, as produced by parse_url()).
 *
 * A stored cookie qualifies when its "domain" attribute equals the host or
 * one of its parent domains down to two labels (www.example.com matches
 * www.example.com and example.com, never the bare TLD). A single-label host
 * such as "localhost" matches itself. Comparison is exact and
 * case-insensitive.
 *
 * Only the "name=value" pair of each cookie is returned, because attributes
 * (path, domain, expires, ...) do not belong in a Cookie request header. When
 * the same cookie name was stored more than once, the most recent one wins.
 *
 * @return array List of "name=value" strings, ready to be joined with "; ".
 */
function get_site_cookies(){
    global $urlp;

    if(empty($_SESSION['cookies']) || !is_array($_SESSION['cookies']) || empty($urlp['host'])){
        return array();
    }

    // build the list of domains the host belongs to
    $labels = explode('.', strtolower($urlp['host']));
    $count = count($labels);
    $domains = array();
    if($count === 1){
        $domains[] = $labels[0];
    }else{
        // start at two labels so the TLD alone is never accepted
        for($i = $count - 2; $i >= 0; $i--){
            $domains[] = implode('.', array_slice($labels, $i));
        }
    }

    // loop through the cookies stored in the session and find all
    // cookies that can be sent
    $ret = array();
    foreach($_SESSION['cookies'] as $cookie){
        if(!is_string($cookie)){
            continue;
        }
        // the domain attribute, without its optional leading dot
        if(!preg_match('#;\s*domain\s*=\s*\.?([^;\s]+)#i', $cookie, $m)){
            continue;
        }
        if(!in_array(strtolower($m[1]), $domains, true)){
            continue;
        }
        // keep only "name=value"; later entries replace earlier ones
        $parts = explode(';', $cookie, 2);
        $pair = trim($parts[0]);
        if($pair === ''){
            continue;
        }
        $eq = strpos($pair, '=');
        $name = ($eq === false) ? $pair : substr($pair, 0, $eq);
        $ret[$name] = $pair;
    }

    return array_values($ret);
}
/**
 * Make sure a Set-Cookie value carries a lowercase "domain=" attribute.
 *
 * - A cookie that already names a domain (in any letter case) keeps it; only
 *   the attribute name is normalised to lowercase "domain=".
 * - A cookie without one gets "; domain=.<domain>" appended, where <domain>
 *   is the last two labels of the current host (global $urlp), or the whole
 *   host for single-label hosts and IP addresses.
 *
 * Surrounding whitespace (including the "\r" left over from the header line)
 * is removed.
 *
 * NOTE(review): "last two labels" is too broad for hosts under multi-part
 * public suffixes such as example.co.uk; fixing that needs a suffix list.
 *
 * @param string $cookie Raw Set-Cookie header value.
 * @return string Cookie string with a domain attribute.
 */
function cookie_domain($cookie){
    global $urlp;

    $cookie = trim((string) $cookie);

    // the server chose a domain itself: keep it, just normalise the name
    if(preg_match('#;\s*domain\s*=#i', $cookie)){
        return preg_replace('#;\s*domain\s*=\s*#i', '; domain=', $cookie);
    }

    $host = isset($urlp['host']) ? (string) $urlp['host'] : '';
    if($host === ''){
        return $cookie;
    }

    $host_pieces = explode('.', $host);
    if(count($host_pieces) < 2 || filter_var($host, FILTER_VALIDATE_IP)){
        // "localhost" or an IP address has no parent domain to share with
        $domain = $host;
    }else{
        $domain = implode('.', array_slice($host_pieces, -2));
    }

    return $cookie.'; domain=.'.$domain;
}
// No target yet (url parameter missing or empty): print a minimal page that
// asks for one and stop; the form submits back to this script via GET.
if(empty($_GET['url'])){
?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<form method="get" action="">
<input type="text" name="url" />
<input type="submit" value="Go" />
</form>
</body>
</html>
<?php
    exit;
}
// location used for the cookie files (only referenced by the disabled
// CURLOPT_COOKIEFILE / CURLOPT_COOKIEJAR options below)
$loc = dirname(__FILE__);
// make sure htmlentities didn't slip into the url
$url = str_replace('&amp;', '&', trim($_GET['url']));
// Make sure the url starts with http:// or https://. Three shapes arrive here:
//   - "http://host/..."  typed by a user or taken from a page as-is
//   - "://host/..." / "s://host/..."  links rewritten by link_href_replace(),
//     which drops the leading "http"
//   - "host/..."  no scheme at all
if(preg_match('#^https?://#i', $url)){
    // already complete, nothing to add
}elseif(preg_match('#^s?://#i', $url)){
    $url = 'http'.$url;
}else{
    $url = 'http://'.$url;
}
// parse the url so it can be rebuilt for certain relative link cases
$urlp = parse_url($url);
// nothing sensible can be fetched without a host
if($urlp === false || empty($urlp['host'])){
    header('HTTP/1.1 400 Bad Request');
    echo 'Invalid URL';
    exit;
}
// "http://host" has no path component at all; treat it as the site root
if(!isset($urlp['path']) || $urlp['path'] === ''){
    $urlp['path'] = '/';
}
// keep a non-default port in every rebuilt url; the colon is supplied here
// because parse_url() returns the bare number
$url_port = isset($urlp['port']) ? ':'.$urlp['port'] : '';
// build a url for the current url path no query string
$cur = build_url(array(
    'scheme' => $urlp['scheme'],
    'host' => $urlp['host'],
    'port' => $url_port,
    'path' => $urlp['path']
));
// fix dir issues when referencing the current directory
// dirname() will remove the last directory name from the path
// if the path does not have a file name at the end
if(substr($urlp['path'], -1) === '/'){
    $cdir_path = $urlp['path'];
}else{
    $cdir_path = dirname($urlp['path']);
}
// fix windows directory separator
$cdir_path = str_replace('\\', '/', $cdir_path);
// always end the directory with exactly one slash so that appending a bare
// relative name ("img/a.png") yields a valid url
$cdir_path = rtrim($cdir_path, '/').'/';
$cur_dir = build_url(array(
    'scheme' => $urlp['scheme'],
    'host' => $urlp['host'],
    'port' => $url_port,
    'path' => $cdir_path
));
// base site url for use with links that start with /
$base = build_url(array(
    'scheme' => $urlp['scheme'],
    'host' => $urlp['host'],
    'port' => $url_port
));
// store method type if it's a form submission
$method = isset($_GET['method']) ? strtolower($_GET['method']) : '';
// if it's a get request build a query string out of everything that
// is sent through $url and $_POST (rewritten forms always submit via post,
// so the fields of a remote GET form arrive in $_POST)
if($method == 'get'){
    $query_vars = array();
    if(isset($urlp['query'])){
        // parse_str() only fills the array passed as its second argument
        parse_str($urlp['query'], $query_vars);
    }
    // submitted fields win over variables already present in the url
    $query_vars = array_replace($query_vars, $_POST);
    $urlp['query'] = http_build_query($query_vars, '', '&');
    // rebuild the url: give the port its colon and drop the fragment,
    // which is never sent to a server
    $url_parts = $urlp;
    if(isset($url_parts['port'])){
        $url_parts['port'] = ':'.$url_parts['port'];
    }
    unset($url_parts['fragment']);
    $url = build_url($url_parts);
}
// fetch the remote page; response headers are returned together with the
// body (CURLOPT_HEADER) so cookies can be extracted afterwards
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
// the url comes from the visitor: never let it (or a redirect) reach
// file://, ftp://, gopher:// and friends
curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
// curl_setopt($ch, CURLOPT_COOKIEFILE, $loc.'/cookie.txt');
// curl_setopt($ch, CURLOPT_COOKIEJAR, $loc.'/cookie.txt');
curl_setopt($ch, CURLOPT_COOKIE, implode('; ', get_site_cookies()));
curl_setopt($ch, CURLOPT_HEADER, true);
// an empty Expect header stops curl from sending "Expect: 100-continue"
curl_setopt($ch, CURLOPT_HTTPHEADER, array('Expect:'));
curl_setopt($ch, CURLINFO_HEADER_OUT, true);
// if the form submission method was post send the $_POST variable along
if($method == 'post'){
    curl_setopt($ch, CURLOPT_POST, true);
    // send as application/x-www-form-urlencoded: handing curl the raw array
    // would switch to multipart/form-data and break on nested fields
    curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($_POST, '', '&'));
}
// turn off peer verification for https to avoid verification issues
// NOTE(review): this disables certificate checking and allows
// man-in-the-middle attacks; prefer configuring a CA bundle instead
if($urlp['scheme']=='https'){
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
}
$content = curl_exec($ch);
if($content === false){
    // report the failure instead of processing a boolean as if it were html
    $fetch_error = curl_error($ch);
    curl_close($ch);
    header('HTTP/1.1 502 Bad Gateway');
    echo 'Unable to fetch the requested page: '.htmlspecialchars($fetch_error);
    exit;
}
curl_close($ch);
// process the content
// split the response header block(s) from the body; there can be several
// blocks in a row (redirects, "100 Continue"), each ended by a blank line
$response_headers = '';
if(preg_match('#^(?:HTTP/.*?(?:\r?\n){2})+#is', (string) $content, $header_match)){
    $response_headers = $header_match[0];
    // remove headers from the content
    $content = (string) substr($content, strlen($response_headers));
}
// extract cookies from the headers only (header names are case-insensitive
// and a page body may legitimately contain the text "Set-Cookie:")
if(!isset($_SESSION['cookies']) || !is_array($_SESSION['cookies'])){
    $_SESSION['cookies'] = array();
}
if(preg_match_all('#^Set-Cookie:[ \t]*([^\r\n]*)#mi', $response_headers, $cookiematch)){
    // store cookies, skipping ones that are already stored verbatim so the
    // session does not grow with every request
    foreach($cookiematch[1] as $cookie){
        $stored_cookie = cookie_domain($cookie);
        if(!in_array($stored_cookie, $_SESSION['cookies'], true)){
            $_SESSION['cookies'][] = $stored_cookie;
        }
    }
}
// Route every anchor through the proxy by rewriting its href.
$content = link_href_replace($content);
// Asset handling: for each tag/attribute pair below, rewrite the relative
// URLs found in that attribute (images, scripts, then <link> elements).
$asset_attributes = array(
    'img' => 'src',
    'script' => 'src',
    'link' => 'href'
);
foreach($asset_attributes as $asset_tag => $asset_attribute){
    $asset_pattern = '#<'.$asset_tag.'.*'.$asset_attribute.'\s*=\s*("|\')([^"\']+|)("|\')#Ui';
    $content = single_asset_replace($asset_pattern, $content);
}
/**
 * Make <meta http-equiv="refresh"> redirects go through the proxy.
 *
 * Handles either attribute order, single or double quotes, multi-digit
 * delays and optional whitespace around ";" and "url=". The target is
 * resolved against the globals $cur_dir, $base and $cur, stripped of its
 * leading "http" (the proxy adds it back) and replaced by
 * "?url=<encoded target>". Refresh tags without a url, or pointing at a
 * non-HTTP scheme, are left untouched.
 *
 * @param string $content Markup to process.
 * @return string The markup with proxied refresh targets.
 */
function proxy_rewrite_meta_refresh($content){
    global $cur_dir, $base, $cur;

    $content = (string) $content;
    if(!preg_match_all('#<meta(?=\s)[^>]*>#i', $content, $tags, PREG_OFFSET_CAPTURE)){
        return $content;
    }

    $out = '';
    $pos = 0;
    foreach($tags[0] as $tag){
        list($html, $tag_offset) = $tag;

        // only refresh tags are of interest
        if(!preg_match('#\shttp-equiv\s*=\s*["\']?refresh\b#i', $html)){
            continue;
        }
        // content="<delay>;url=<target>" - group 2 is the target
        if(!preg_match('#\scontent\s*=\s*(["\'])\s*[\d.]+\s*[;,]\s*url\s*=\s*(.*?)\s*\1#is', $html, $m, PREG_OFFSET_CAPTURE)){
            continue;
        }
        list($raw, $uri_offset) = $m[2];
        // the target may carry its own quotes: content="0;url='page.html'"
        $uri = trim($raw, " \t\r\n'\"");
        if($uri === ''){
            continue;
        }

        // make the target absolute
        if(preg_match('#^https?://#i', $uri)){
            // already absolute
        }elseif(preg_match('#^[a-z][a-z0-9+.\-]*:#i', $uri)){
            // not something the proxy can fetch
            continue;
        }elseif(strpos($uri, '//') === 0){
            $scheme = parse_url((string) $base, PHP_URL_SCHEME);
            $uri = ($scheme ? $scheme : 'http').':'.$uri;
        }elseif($uri[0] === '/'){
            $uri = $base.$uri;
        }elseif($uri[0] === '?'){
            $uri = $cur.$uri;
        }else{
            if(strpos($uri, './') === 0){
                $uri = substr($uri, 2);
            }
            $uri = rtrim((string) $cur_dir, '/').'/'.$uri;
        }

        // same convention as proxied links: drop the leading "http"
        $uri = preg_replace('#^http#i', '', $uri);

        // splice the proxied target in at the exact position of the old one
        $start = $tag_offset + $uri_offset;
        $out .= substr($content, $pos, $start - $pos).'?url='.urlencode($uri);
        $pos = $start + strlen($raw);
    }

    return $out.substr($content, $pos);
}

// make sure the meta refreshes go through our script
$content = proxy_rewrite_meta_refresh($content);
/**
 * Point every <form> at the proxy.
 *
 * Each form tag gets
 *   action="?url=<encoded target>&amp;method=<original method>" method="post"
 * so the browser always posts to the proxy, which then replays the fields to
 * the remote site using the original method. Forms are always submitted via
 * post because a browser GET submission would discard the query string that
 * carries the url parameter.
 *
 * The target is the form's action resolved against the globals $cur_dir,
 * $base and $cur; an empty, missing or fragment-only action means the current
 * page. A missing or unknown method counts as "get". The leading "http" of
 * the target is dropped (the proxy adds it back). Forms whose action uses a
 * non-HTTP scheme (javascript:, mailto:, ...) are left untouched.
 *
 * @param string $content Markup to process.
 * @return string The markup with proxied forms.
 */
function proxy_rewrite_forms($content){
    global $cur_dir, $base, $cur;

    $content = (string) $content;
    if(!preg_match_all('#<form(?=[\s>])[^>]*>#i', $content, $tags, PREG_OFFSET_CAPTURE)){
        return $content;
    }

    // attribute value: double-quoted, single-quoted or bare
    $value = '\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s>]+))';

    $out = '';
    $pos = 0;
    foreach($tags[0] as $tag){
        list($html, $offset) = $tag;

        // only one of the three value groups is ever filled
        $action = '';
        if(preg_match('#\saction'.$value.'#i', $html, $m)){
            $action = trim(implode('', array_slice($m, 1)));
        }
        $form_method = 'get';
        if(preg_match('#\smethod'.$value.'#i', $html, $m)
            && strtolower(trim(implode('', array_slice($m, 1)))) === 'post'){
            $form_method = 'post';
        }

        // modify the form's action to use our script
        if($action === '' || $action[0] === '#'){
            $action = $cur;
        }elseif(preg_match('#^https?://#i', $action)){
            // already absolute
        }elseif(preg_match('#^[a-z][a-z0-9+.\-]*:#i', $action)){
            // not something the proxy can submit to
            continue;
        }elseif(strpos($action, '//') === 0){
            $scheme = parse_url((string) $base, PHP_URL_SCHEME);
            $action = ($scheme ? $scheme : 'http').':'.$action;
        }elseif($action[0] === '/'){
            $action = $base.$action;
        }elseif($action[0] === '?'){
            $action = $cur.$action;
        }else{
            if(strpos($action, './') === 0){
                $action = substr($action, 2);
            }
            $action = rtrim((string) $cur_dir, '/').'/'.$action;
        }

        // same convention as proxied links: drop the leading "http"
        $action = preg_replace('#^http#i', '', (string) $action);

        // build action query string
        $target = '?url='.urlencode($action).'&amp;method='.$form_method;

        // drop the old action/method attributes and add the new ones right
        // after "<form" (5 characters), leaving every other attribute alone
        $stripped = preg_replace('#\s(?:action|method)'.$value.'#i', '', $html);
        $rebuilt = substr($stripped, 0, 5)
            .' action="'.$target.'" method="post"'
            .substr($stripped, 5);

        $out .= substr($content, $pos, $offset - $pos).$rebuilt;
        $pos = $offset + strlen($html);
    }

    return $out.substr($content, $pos);
}

// send all forms through post then send the values to the remote site
$content = proxy_rewrite_forms($content);
echo $content;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment