Skip to content

Commit 2ba271b

Browse files
Sven Hagemann authored and jschroed91 committed
Issue #77: Performance Fixes (#81)
* Updated the performance fixture
* Added MbStringUtil as a wrapper around string functions to use mb_* functions only when necessary
* Added multibyte test
* Cleanup: the code path using the Tidy extension was disabled a long time ago; pretty sure that it can be removed
* Removed the overhead of checking using strlen(); using strict string compare is about 30% faster
* Added strict types and reduced the call graph, which increased the performance by about 3%
1 parent 6f39bc3 commit 2ba271b

File tree

10 files changed

+137
-77
lines changed

10 files changed

+137
-77
lines changed

lib/Caxy/HtmlDiff/AbstractDiff.php

Lines changed: 12 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
namespace Caxy\HtmlDiff;
44

5+
use Caxy\HtmlDiff\Util\MbStringUtil;
6+
57
/**
68
* Class AbstractDiff.
79
*/
@@ -79,6 +81,11 @@ abstract class AbstractDiff
7981
*/
8082
protected $resetCache = false;
8183

84+
/**
85+
* @var MbStringUtil
86+
*/
87+
protected $stringUtil;
88+
8289
/**
8390
* AbstractDiff constructor.
8491
*
@@ -90,7 +97,7 @@ abstract class AbstractDiff
9097
*/
9198
public function __construct($oldText, $newText, $encoding = 'UTF-8', $specialCaseTags = null, $groupDiffs = null)
9299
{
93-
mb_substitute_character(0x20);
100+
$this->stringUtil = new MbStringUtil($oldText, $newText);
94101

95102
$this->setConfig(HtmlDiffConfig::create()->setEncoding($encoding));
96103

@@ -389,44 +396,13 @@ protected function getClosingTag($tag)
389396
return '</'.$tag.'>';
390397
}
391398

392-
/**
393-
* @param string $str
394-
* @param string $start
395-
* @param string $end
396-
*
397-
* @return string
398-
*/
399-
protected function getStringBetween($str, $start, $end)
400-
{
401-
$expStr = mb_split($start, $str, 2);
402-
if (count($expStr) > 1) {
403-
$expStr = mb_split($end, $expStr[ 1 ]);
404-
if (count($expStr) > 1) {
405-
array_pop($expStr);
406-
407-
return implode($end, $expStr);
408-
}
409-
}
410-
411-
return '';
412-
}
413-
414399
/**
415400
* @param string $html
416401
*
417402
* @return string
418403
*/
419404
protected function purifyHtml($html)
420405
{
421-
if (class_exists('Tidy') && false) {
422-
$config = array('output-xhtml' => true, 'indent' => false);
423-
$tidy = new tidy();
424-
$tidy->parseString($html, $config, 'utf8');
425-
$html = (string) $tidy;
426-
427-
return $this->getStringBetween($html, '<body>');
428-
}
429-
430406
return $this->purifier->purify($html);
431407
}
432408

@@ -493,7 +469,7 @@ protected function convertHtmlToListOfWords($characterString)
493469
$mode = 'whitespace';
494470
} else {
495471
if (
496-
(($this->ctypeAlphanumUnicode($character)) && (mb_strlen($current_word) == 0 || $this->isPartOfWord($current_word))) ||
472+
(($this->ctypeAlphanumUnicode($character) === true) && ($this->stringUtil->strlen($current_word) === 0 || $this->isPartOfWord($current_word))) ||
497473
(in_array($character, $this->config->getSpecialCaseChars()) && isset($characterString[$i + 1]) && $this->isPartOfWord($characterString[$i + 1]))
498474
) {
499475
$current_word .= $character;
@@ -554,7 +530,7 @@ protected function convertHtmlToListOfWords($characterString)
554530
*/
555531
protected function isStartOfTag($val)
556532
{
557-
return $val == '<';
533+
return $val === '<';
558534
}
559535

560536
/**
@@ -564,7 +540,7 @@ protected function isStartOfTag($val)
564540
*/
565541
protected function isEndOfTag($val)
566542
{
567-
return $val == '>';
543+
return $val === '>';
568544
}
569545

570546
/**
@@ -595,6 +571,6 @@ protected function explode($value)
595571
*/
596572
protected function ctypeAlphanumUnicode($str)
597573
{
598-
return preg_match("/^[a-zA-Z0-9\pL]+$/u", $str);
574+
return preg_match("/^[a-zA-Z0-9\pL]+$/u", $str) === 1;
599575
}
600576
}

lib/Caxy/HtmlDiff/HtmlDiff.php

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ protected function createIsolatedDiffTagPlaceholders(&$words)
158158
foreach ($words as $index => $word) {
159159
$openIsolatedDiffTag = $this->isOpeningIsolatedDiffTag($word, $currentIsolatedDiffTag);
160160
if ($openIsolatedDiffTag) {
161-
if ($this->isSelfClosingTag($word) || mb_stripos($word, '<img') !== false) {
161+
if ($this->isSelfClosingTag($word) || $this->stringUtil->stripos($word, '<img') !== false) {
162162
if ($openIsolatedDiffTags === 0) {
163163
$isolatedDiffTagIndices[] = array(
164164
'start' => $index,
@@ -543,7 +543,7 @@ protected function insertTag($tag, $cssClass, &$words)
543543
$specialCaseTagInjection = '';
544544
$specialCaseTagInjectionIsBefore = false;
545545

546-
if (count($nonTags) != 0) {
546+
if (count($nonTags) !== 0) {
547547
$text = $this->wrapText(implode('', $nonTags), $tag, $cssClass);
548548
$this->content .= $text;
549549
} else {
@@ -567,15 +567,15 @@ protected function insertTag($tag, $cssClass, &$words)
567567
}
568568
}
569569
}
570-
if (count($words) == 0 && mb_strlen($specialCaseTagInjection) == 0) {
570+
if (count($words) == 0 && $this->stringUtil->strlen($specialCaseTagInjection) == 0) {
571571
break;
572572
}
573573
if ($specialCaseTagInjectionIsBefore) {
574574
$this->content .= $specialCaseTagInjection.implode('', $this->extractConsecutiveWords($words, 'tag'));
575575
} else {
576576
$workTag = $this->extractConsecutiveWords($words, 'tag');
577577
if (isset($workTag[ 0 ]) && $this->isOpeningTag($workTag[ 0 ]) && !$this->isClosingTag($workTag[ 0 ])) {
578-
if (mb_strpos($workTag[ 0 ], 'class=')) {
578+
if ($this->stringUtil->strpos($workTag[ 0 ], 'class=')) {
579579
$workTag[ 0 ] = str_replace('class="', 'class="diffmod ', $workTag[ 0 ]);
580580
$workTag[ 0 ] = str_replace("class='", 'class="diffmod ', $workTag[ 0 ]);
581581
} else {
@@ -584,7 +584,7 @@ protected function insertTag($tag, $cssClass, &$words)
584584
}
585585

586586
$appendContent = implode('', $workTag).$specialCaseTagInjection;
587-
if (isset($workTag[0]) && false !== mb_stripos($workTag[0], '<img')) {
587+
if (isset($workTag[0]) && false !== $this->stringUtil->stripos($workTag[0], '<img')) {
588588
$appendContent = $this->wrapText($appendContent, $tag, $cssClass);
589589
}
590590
$this->content .= $appendContent;
@@ -698,7 +698,7 @@ protected function operations()
698698
$matches = $this->matchingBlocks();
699699
$matches[] = new Match(count($this->oldWords), count($this->newWords), 0);
700700

701-
foreach ($matches as $i => $match) {
701+
foreach ($matches as $match) {
702702
$matchStartsAtCurrentPositionInOld = ($positionInOld === $match->startInOld);
703703
$matchStartsAtCurrentPositionInNew = ($positionInNew === $match->startInNew);
704704

@@ -769,10 +769,10 @@ protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endI
769769
*/
770770
protected function stripTagAttributes($word)
771771
{
772-
$space = mb_strpos($word, ' ', 1);
772+
$space = $this->stringUtil->strpos($word, ' ', 1);
773773

774774
if ($space) {
775-
return '<' . mb_substr($word, 1, $space) . '>';
775+
return '<' . $this->stringUtil->substr($word, 1, $space) . '>';
776776
}
777777

778778
return trim($word, '<>');
@@ -788,6 +788,7 @@ protected function stripTagAttributes($word)
788788
*/
789789
protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
790790
{
791+
$groupDiffs = $this->isGroupDiffs();
791792
$bestMatchInOld = $startInOld;
792793
$bestMatchInNew = $startInNew;
793794
$bestMatchSize = 0;
@@ -816,7 +817,7 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
816817

817818
if ($newMatchLength > $bestMatchSize ||
818819
(
819-
$this->isGroupDiffs() &&
820+
$groupDiffs &&
820821
$bestMatchSize > 0 &&
821822
$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
822823
)
@@ -830,9 +831,9 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
830831
}
831832

832833
// Skip match if none found or match consists only of whitespace
833-
if ($bestMatchSize != 0 &&
834+
if ($bestMatchSize !== 0 &&
834835
(
835-
!$this->isGroupDiffs() ||
836+
!$groupDiffs ||
836837
!$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
837838
)
838839
) {
@@ -850,7 +851,7 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
850851
protected function isOnlyWhitespace($str)
851852
{
852853
// Slightly faster then using preg_match
853-
return $str !== '' && (mb_strlen(trim($str)) === 0);
854+
return $str !== '' && trim($str) === '';
854855
}
855856

856857
/**

lib/Caxy/HtmlDiff/ListDiff.php

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ protected function buildDiffList($words)
233233
$list[] = $word;
234234
}
235235
} else {
236-
$listType = mb_substr($word, 1, 2);
236+
$listType = $this->stringUtil->substr($word, 1, 2);
237237
$listStartTag = $word;
238238
}
239239

@@ -254,7 +254,7 @@ protected function buildDiffList($words)
254254
if ($openListItems === 0) {
255255
// New top-level list item
256256
$currentListItem = array();
257-
$listItemType = mb_substr($word, 1, 2);
257+
$listItemType = $this->stringUtil->substr($word, 1, 2);
258258
$listItemStart = $word;
259259
} else {
260260
$currentListItem[] = $word;
@@ -290,27 +290,27 @@ protected function isOpeningListTag($word, $type = null)
290290
{
291291
$filter = $type !== null ? array('<'.$type) : array('<ul', '<ol', '<dl');
292292

293-
return in_array(mb_substr($word, 0, 3), $filter);
293+
return in_array($this->stringUtil->substr($word, 0, 3), $filter);
294294
}
295295

296296
protected function isClosingListTag($word, $type = null)
297297
{
298298
$filter = $type !== null ? array('</'.$type) : array('</ul', '</ol', '</dl');
299299

300-
return in_array(mb_substr($word, 0, 4), $filter);
300+
return in_array($this->stringUtil->substr($word, 0, 4), $filter);
301301
}
302302

303303
protected function isOpeningListItemTag($word, $type = null)
304304
{
305305
$filter = $type !== null ? array('<'.$type) : array('<li', '<dd', '<dt');
306306

307-
return in_array(mb_substr($word, 0, 3), $filter);
307+
return in_array($this->stringUtil->substr($word, 0, 3), $filter);
308308
}
309309

310310
protected function isClosingListItemTag($word, $type = null)
311311
{
312312
$filter = $type !== null ? array('</'.$type) : array('</li', '</dd', '</dt');
313313

314-
return in_array(mb_substr($word, 0, 4), $filter);
314+
return in_array($this->stringUtil->substr($word, 0, 4), $filter);
315315
}
316316
}

lib/Caxy/HtmlDiff/ListDiffLines.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ public function build()
6565
return $this->content;
6666
}
6767

68-
$matchStrategy = new ListItemMatchStrategy($this->config->getMatchThreshold());
68+
$matchStrategy = new ListItemMatchStrategy($this->stringUtil, $this->config->getMatchThreshold());
6969
$this->lcsService = new LcsService($matchStrategy);
7070

7171
return $this->listByLines($this->oldText, $this->newText);

lib/Caxy/HtmlDiff/Preprocessor.php

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,22 @@
44

55
class Preprocessor
66
{
7-
public static function diffCommonPrefix($old, $new)
7+
public static function diffCommonPrefix($old, $new, $stringUtil)
88
{
99
// Quick check for common null cases.
10-
if (mb_strlen($old) == 0 || mb_strlen($new) == 0 || mb_substr($old, 0, 1) != mb_substr($new, 0, 1)) {
10+
if ($stringUtil->strlen($old) == 0 || $stringUtil->strlen($new) == 0 || $stringUtil->substr($old, 0, 1) != $stringUtil->substr($new, 0, 1)) {
1111
return 0;
1212
}
1313

1414
// Binary Search
1515
$pointerMin = 0;
16-
$pointerMax = min(mb_strlen($old), mb_strlen($new));
16+
$pointerMax = min($stringUtil->strlen($old), $stringUtil->strlen($new));
1717
$pointerMid = $pointerMax;
1818
$pointerStart = 0;
1919
while ($pointerMin < $pointerMid) {
2020
$cmp = substr_compare(
2121
$old,
22-
mb_substr($new, $pointerStart, $pointerMid - $pointerStart),
22+
$stringUtil->substr($new, $pointerStart, $pointerMid - $pointerStart),
2323
$pointerStart,
2424
$pointerMid - $pointerStart
2525
);
@@ -34,22 +34,22 @@ public static function diffCommonPrefix($old, $new)
3434
return $pointerMid;
3535
}
3636

37-
public static function diffCommonSuffix($old, $new)
37+
public static function diffCommonSuffix($old, $new, $stringUtil)
3838
{
3939
// Quick check for common null cases.
40-
if (mb_strlen($old) == 0 || mb_strlen($new) == 0 || mb_substr($old, mb_strlen($old) - 1, 1) != mb_substr($new, mb_strlen($new) - 1, 1)) {
40+
if ($stringUtil->strlen($old) == 0 || $stringUtil->strlen($new) == 0 || $stringUtil->substr($old, $stringUtil->strlen($old) - 1, 1) != $stringUtil->substr($new, $stringUtil->strlen($new) - 1, 1)) {
4141
return 0;
4242
}
4343

4444
// Binary Search
4545
$pointerMin = 0;
46-
$pointerMax = min(mb_strlen($old), mb_strlen($new));
46+
$pointerMax = min($stringUtil->strlen($old), $stringUtil->strlen($new));
4747
$pointerMid = $pointerMax;
4848
$pointerEnd = 0;
49-
$oldLen = mb_strlen($old);
50-
$newLen = mb_strlen($new);
49+
$oldLen = $stringUtil->strlen($old);
50+
$newLen = $stringUtil->strlen($new);
5151
while ($pointerMin < $pointerMid) {
52-
if (mb_substr($old, $oldLen - $pointerMid, $pointerMid - $pointerEnd) == mb_substr($new, $newLen - $pointerMid, $pointerMid - $pointerEnd)) {
52+
if ($stringUtil->substr($old, $oldLen - $pointerMid, $pointerMid - $pointerEnd) == $stringUtil->substr($new, $newLen - $pointerMid, $pointerMid - $pointerEnd)) {
5353
$pointerMin = $pointerMid;
5454
$pointerEnd = $pointerMin;
5555
} else {

lib/Caxy/HtmlDiff/Strategy/ListItemMatchStrategy.php

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,15 @@
33
namespace Caxy\HtmlDiff\Strategy;
44

55
use Caxy\HtmlDiff\Preprocessor;
6+
use Caxy\HtmlDiff\Util\MbStringUtil;
67

78
class ListItemMatchStrategy implements MatchStrategyInterface
89
{
10+
/**
11+
* @var MbStringUtil
12+
*/
13+
protected $stringUtil;
14+
915
/**
1016
* @var int
1117
*/
@@ -24,12 +30,14 @@ class ListItemMatchStrategy implements MatchStrategyInterface
2430
/**
2531
* ListItemMatchStrategy constructor.
2632
*
27-
* @param int $similarityThreshold
28-
* @param float $lengthRatioThreshold
29-
* @param float $commonTextRatioThreshold
33+
* @param MbStringUtil $stringUtil
34+
* @param int $similarityThreshold
35+
* @param float $lengthRatioThreshold
36+
* @param float $commonTextRatioThreshold
3037
*/
31-
public function __construct($similarityThreshold = 80, $lengthRatioThreshold = 0.1, $commonTextRatioThreshold = 0.6)
38+
public function __construct($stringUtil, $similarityThreshold = 80, $lengthRatioThreshold = 0.1, $commonTextRatioThreshold = 0.6)
3239
{
40+
$this->stringUtil = $stringUtil;
3341
$this->similarityThreshold = $similarityThreshold;
3442
$this->lengthRatioThreshold = $lengthRatioThreshold;
3543
$this->commonTextRatioThreshold = $commonTextRatioThreshold;
@@ -63,20 +71,20 @@ public function isMatch($a, $b)
6371
// Check common prefix/ suffix length
6472
$aCleaned = trim($aStripped);
6573
$bCleaned = trim($bStripped);
66-
if (mb_strlen($aCleaned) === 0 || mb_strlen($bCleaned) === 0) {
74+
if ($this->stringUtil->strlen($aCleaned) === 0 || $this->stringUtil->strlen($bCleaned) === 0) {
6775
$aCleaned = $a;
6876
$bCleaned = $b;
6977
}
70-
if (mb_strlen($aCleaned) === 0 || mb_strlen($bCleaned) === 0) {
78+
if ($this->stringUtil->strlen($aCleaned) === 0 || $this->stringUtil->strlen($bCleaned) === 0) {
7179
return false;
7280
}
73-
$prefixIndex = Preprocessor::diffCommonPrefix($aCleaned, $bCleaned);
74-
$suffixIndex = Preprocessor::diffCommonSuffix($aCleaned, $bCleaned);
81+
$prefixIndex = Preprocessor::diffCommonPrefix($aCleaned, $bCleaned, $this->stringUtil);
82+
$suffixIndex = Preprocessor::diffCommonSuffix($aCleaned, $bCleaned, $this->stringUtil);
7583

7684
// Use shorter string, and see how much of it is leftover
77-
$len = min(mb_strlen($aCleaned), mb_strlen($bCleaned));
85+
$len = min($this->stringUtil->strlen($aCleaned), $this->stringUtil->strlen($bCleaned));
7886
$remaining = $len - ($prefixIndex + $suffixIndex);
79-
$strLengthPercent = $len / max(mb_strlen($a), mb_strlen($b));
87+
$strLengthPercent = $len / max($this->stringUtil->strlen($a), $this->stringUtil->strlen($b));
8088

8189
if ($remaining === 0 && $strLengthPercent > $this->lengthRatioThreshold) {
8290
return true;

0 commit comments

Comments
 (0)