I played around with this some today...
This attempts to essentially make it possible to use GetToken on the string.
1) It replaces the bracket & quote characters with other delimiters.
2) Then it restores those brackets which are within quotes. (It handles backslash-escaped quotes, i.e. \' and \", unlike XY's own doubled-quote escaping, '' and "".)
3) Then it gets the specified token.
4) Locates the ending delimiter.
5) Replaces the other delimiters with their original values.
6) Wraps it back up in the outer brackets.
7) Returns the string.
I'm not sure if it works exactly as you wanted, or is any better, but it does avoid iterating over every character and hitting the regex engine, though it introduces more loops.
For your big input it gives me this:
Code: Select all
Original: a:1:{s:9:"@test.bat {a}";a:1:{s:24:"AJXP_METADATA_SHAREDUSER";a:1:{s:10:"users_meta";a:1:{s:4:"tags";s:20:"batch file {new tag}";}}}}
0: Error: $lvl must be >= 1 or "count"!
1: {s:9:"@test.bat {a}";a:1:{s:24:"AJXP_METADATA_SHAREDUSER";a:1:{s:10:"users_meta";a:1:{s:4:"tags";s:20:"batch file {new tag}";}}}}
2: {s:24:"AJXP_METADATA_SHAREDUSER";a:1:{s:10:"users_meta";a:1:{s:4:"tags";s:20:"batch file {new tag}";}}}
3: {s:10:"users_meta";a:1:{s:4:"tags";s:20:"batch file {new tag}";}}
4: {s:4:"tags";s:20:"batch file {new tag}";}
5:
6:
Count: 5
EDIT: And yes, it returns 5 for the count. I'm not sure what you're expecting, but there's a TODO for you to adjust it if need be. //edit.
Code: Select all
// FindFreeCharCode
// Returns the lowest character code in the range $startIdx..65535
// (the search INCLUDES $startIdx itself) whose character does not
// occur in $str. The lookup is case-sensitive.
//   $str      - the text that must not contain the returned character.
//   $startIdx - first character code to try (default 0).
// Returns -1 when $startIdx lies outside 0..65535 or when every code
// in the range is already used by $str.
function FindFreeCharCode($str, $startIdx=0) {
  // A start index outside the valid code range can never yield a usable code.
  if ($startIdx < 0 || $startIdx > 65535) { return -1; }
  // Nothing can collide with an empty string, so the start index is free.
  if ($str == '') { return $startIdx; }
  $i = $startIdx;
  while ($i <= 65535) {
    // StrPos yields -1 when the character is absent (4th argument: match case).
    if (StrPos($str, chr($i), 0, true) == -1) {
      return $i;
    }
    $i++;
  }
  // Every code from $startIdx up to 65535 occurs in $str.
  return -1;
}
// getSerializedData
// Extracts one bracketed section from $str by temporarily swapping the
// bracket tokens for characters that do not occur in $str, so GetToken
// can be used to split on them. Brackets inside '...' or "..." are not
// treated as delimiters; \' and \" are treated as escaped quotes.
//   $str     - the text to scan.
//   $lvl     - 1-based index of the unquoted opening token at which the
//              returned section starts, or "count".
//   $pattern - "<open>|<close>" pair of bracket tokens (default '{|}').
// Returns:
//   - an 'Error: ...' string when an argument is rejected;
//   - for "count": GetToken's token count for the text split on the
//     unquoted opening tokens (i.e. number of such tokens + 1);
//   - '' when there is no $lvl-th opening token;
//   - otherwise the section from that opening token up to its matching
//     closing token, wrapped in the original tokens.
function getSerializedData($str, $lvl = 1, $pattern = '{|}') {
  if (!$str) { return 'Error: $str is empty!'; }
  // "count" asks for the number of occurrences instead of real data.
  $isCount = ($lvl LikeI 'count') ? 1 : 0;
  // Reject empty/zero levels and, unless counting, anything below 1.
  if (!$lvl || ($isCount == 0 && $lvl < 1)) { return 'Error: $lvl must be >= 1 or "count"!'; }
  // if !(isBalanced($str)) { return 'Error: $str does not contain serialized data!'; }
  if (!$pattern || !regexmatches($pattern, "\|")) { return 'Error: $pattern syntax is invalid!'; }
  $fToken = gettoken($pattern, 1, "|"); // First Token (opening bracket)
  $sToken = gettoken($pattern, 2, "|"); // Second Token (closing bracket)
  // A pattern such as "{|" contains the separator but leaves a token empty.
  if ($fToken == '' || $sToken == '') { return 'Error: $pattern syntax is invalid!'; }

  // Find free characters to use for separators.
  // Each search starts one past the previous hit so all five differ.
  $openChar = FindFreeCharCode($str);
  $endChar = FindFreeCharCode($str, $openChar+1);
  $sQuoteChar = FindFreeCharCode($str, $endChar+1);
  $dQuoteChar = FindFreeCharCode($str, $sQuoteChar+1);
  $quoteChar = FindFreeCharCode($str, $dQuoteChar+1);
  // Ensure none are <0. Every code is checked, not just the last one:
  // after a failed lookup (-1) the next search restarts at 0 and could
  // succeed with a code that is already in use.
  $codesOk = ($openChar >= 0 && $endChar >= 0 && $sQuoteChar >= 0 && $dQuoteChar >= 0 && $quoteChar >= 0) ? 1 : 0;
  Assert $codesOk == 1, 'Could not find enough free characters...';

  // Convert from index to chars.
  $openChar = chr($openChar);
  $endChar = chr($endChar);
  $sQuoteChar = chr($sQuoteChar);
  $dQuoteChar = chr($dQuoteChar);
  $quoteChar = chr($quoteChar);

  // TODO: Handle XY's double quote to escape.
  // Currently this only handles the \['"] method of escaping.
  // XY's method is a bigger pain in the butt because it also
  // needs to be distinguished from empty strings.

  // Replace tokens with new characters.
  // It's tempting to use ReplaceList here...
  // but then you need to find a separator that's not in anything else.
  $newStr = Replace($str, $fToken, $openChar);       // Starting Bracket
  $newStr = Replace($newStr, $sToken, $endChar);     // Ending Bracket
  $newStr = Replace($newStr, "\'", $sQuoteChar);     // Escaped Single Quotes
  $newStr = Replace($newStr, '\"', $dQuoteChar);     // Escaped Double Quotes
  $newStr = Replace($newStr, "'", $quoteChar . "'"); // Single Quotes
  $newStr = Replace($newStr, '"', $quoteChar . '"'); // Double Quotes

  // Will hold an escaped version of the string.
  $finalStr = '';
  // Track if we're in quotes...
  $inDQ = false;
  $inSQ = false;
  // Every quote was prefixed with $quoteChar, so each piece after the
  // first begins with the quote character that opened or closed it.
  foreach ($t, $newStr, $quoteChar) {
    // Get state of quotes first; a quote of the other kind inside an
    // open quoted section does not toggle anything.
    if ($t LikeI "'*" && ! $inDQ) {
      $inSQ = ! $inSQ;
    } elseif ($t LikeI '"*' && ! $inSQ) {
      $inDQ = ! $inDQ;
    }
    // If in either set of quotes restore the brackets.
    if ($inDQ || $inSQ) {
      $t = Replace($t, $openChar, $fToken);
      $t = Replace($t, $endChar, $sToken);
    }
    // Build up the escaped string.
    $finalStr = $finalStr . $t;
  }
  $finalStr = Replace($finalStr, $quoteChar); // Shouldn't do anything.

  // Restore escaped quotes.
  $finalStr = Replace($finalStr, $sQuoteChar, "\'");
  $finalStr = Replace($finalStr, $dQuoteChar, '\"');

  // Special case for count.
  if ($isCount == 1) {
    // TODO: This may need to be modified to account for the outer level.
    return GetToken($finalStr, $lvl, $openChar);
  }

  // Get the token (+1 to skip exterior); flag 2 keeps everything after it too.
  $finalStr = GetToken($finalStr, $lvl+1, $openChar,, 2);
  if ($finalStr == '') {
    return '';
  }

  // Now we need to identify the correct closing bracket.
  // Walk the openers and closers in lockstep: the section ends at the
  // first closer that has no opener before it left to pair with.
  $nextOpen = -1;
  $nextClose = -1;
  while (true) {
    $nextOpen = StrPos($finalStr, $openChar, $nextOpen+1);
    $nextClose = StrPos($finalStr, $endChar, $nextClose+1);
    if ($nextOpen == -1 || $nextClose < $nextOpen) {
      break;
    }
  }

  // Trim to the closing bracket (left untrimmed when none was found).
  if ($nextClose >= 0) {
    $finalStr = SubStr($finalStr, 0, $nextClose);
  }

  // Replace our characters with the actual brackets.
  $finalStr = Replace($finalStr, $openChar, $fToken);
  $finalStr = Replace($finalStr, $endChar, $sToken);

  // Restore surrounding brackets.
  return $fToken . $finalStr . $sToken;
}
"Test"
// Test driver: runs getSerializedData over a few sample inputs and shows
// the collected results with the Text command.
// Sample inputs, one per line. The heredoc body is taken literally, so
// no comments may be placed inside it.
$test = <<<'TESTS'
a:1:{s:9:"@test.bat {a}";a:1:{s:24:"AJXP_METADATA_SHAREDUSER";a:1:{s:10:"users_meta";a:1:{s:4:"tags";s:20:"batch file {new tag}";}}}}
a{b{c{d{f{g}}}}}
a"{}"b{c}
TESTS;
// Accumulates the report for all inputs.
$results = '';
// One pass per input line (lines are split on <crlf>).
foreach ($a, $test, <crlf>) {
// Level 0 exercises the $lvl validation; levels 1-6 request successive
// sections; 'count' requests the count instead of data.
$t0 = getSerializedData($a, 0);
$t1 = getSerializedData($a, 1);
$t2 = getSerializedData($a, 2);
$t3 = getSerializedData($a, 3);
$t4 = getSerializedData($a, 4);
$t5 = getSerializedData($a, 5);
$t6 = getSerializedData($a, 6);
$tc = getSerializedData($a, 'count');
// Append this input's report to what has been collected so far; the
// variables are interpolated inside the heredoc.
$results = <<<TEXT
$results
Original: $a
0: $t0
1: $t1
2: $t2
3: $t3
4: $t4
5: $t5
6: $t6
Count: $tc
TEXT;
}
// Show the full report.
Text $results;