From 4998e5662224797350fe8bd9908a19399fee72fb Mon Sep 17 00:00:00 2001 From: Jorge Leandro Perez Date: Fri, 8 Nov 2019 17:12:50 -0300 Subject: [PATCH 1/6] DiffMatchPatchCFUtilities: Accounting for surrogate pairs --- External/diffmatchpatch/DiffMatchPatchCFUtilities.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/External/diffmatchpatch/DiffMatchPatchCFUtilities.c b/External/diffmatchpatch/DiffMatchPatchCFUtilities.c index 588f3d78..ceb4d61e 100755 --- a/External/diffmatchpatch/DiffMatchPatchCFUtilities.c +++ b/External/diffmatchpatch/DiffMatchPatchCFUtilities.c @@ -112,6 +112,9 @@ CFIndex diff_commonPrefix(CFStringRef text1, CFStringRef text2) { char2 = CFStringGetCharacterFromInlineBuffer(&text2_inlineBuffer, i); if (char1 != char2) { + if ( CFStringIsSurrogateLowCharacter(char1) || CFStringIsSurrogateHighCharacter(char1) ) { + i = MAX(i - 1, 0); + } return i; } } @@ -142,7 +145,11 @@ CFIndex diff_commonSuffix(CFStringRef text1, CFStringRef text2) { char2 = CFStringGetCharacterFromInlineBuffer(&text2_inlineBuffer, (text2_length - i)); if (char1 != char2) { - return i - 1; + if ( CFStringIsSurrogateLowCharacter(char1) || CFStringIsSurrogateHighCharacter(char1) ) { + return MIN(i - 2, 0); + } + + return i - 1; } } return n; From a447f8487c58c919bd0e4a070e6a9b8f26778b80 Mon Sep 17 00:00:00 2001 From: Jorge Leandro Perez Date: Fri, 8 Nov 2019 17:13:09 -0300 Subject: [PATCH 2/6] DiffMatchPatchCFUtilities: cleanupSemanticScore considers surrogate pairs --- .../DiffMatchPatchCFUtilities.c | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/External/diffmatchpatch/DiffMatchPatchCFUtilities.c b/External/diffmatchpatch/DiffMatchPatchCFUtilities.c index ceb4d61e..4b468355 100755 --- a/External/diffmatchpatch/DiffMatchPatchCFUtilities.c +++ b/External/diffmatchpatch/DiffMatchPatchCFUtilities.c @@ -655,27 +655,19 @@ CFIndex diff_cleanupSemanticScore(CFStringRef one, CFStringRef two) { // 'whitespace'. Since this function's purpose is largely cosmetic, // the choice has been made to use each language's native features // rather than force total conformity. - UniChar char1 = - CFStringGetCharacterAtIndex(one, (CFStringGetLength(one) - 1)); - UniChar char2 = - CFStringGetCharacterAtIndex(two, 0); - Boolean nonAlphaNumeric1 = - !CFCharacterSetIsCharacterMember(alphaNumericSet, char1); - Boolean nonAlphaNumeric2 = - !CFCharacterSetIsCharacterMember(alphaNumericSet, char2); - Boolean whitespace1 = - nonAlphaNumeric1 && CFCharacterSetIsCharacterMember(whiteSpaceSet, char1); - Boolean whitespace2 = - nonAlphaNumeric2 && CFCharacterSetIsCharacterMember(whiteSpaceSet, char2); - Boolean lineBreak1 = - whitespace1 && CFCharacterSetIsCharacterMember(controlSet, char1); - Boolean lineBreak2 = - whitespace2 && CFCharacterSetIsCharacterMember(controlSet, char2); - Boolean blankLine1 = - lineBreak1 && diff_regExMatch(one, &blankLineEndRegEx); - Boolean blankLine2 = - lineBreak2 && diff_regExMatch(two, &blankLineStartRegEx); - + UniChar char1 = CFStringGetCharacterAtIndex(one, (CFStringGetLength(one) - 1)); + UniChar char2 = CFStringGetCharacterAtIndex(two, 0); + Boolean char1IsSurrogate = CFStringIsSurrogateLowCharacter(char1) || CFStringIsSurrogateHighCharacter(char1); + Boolean char2IsSurrogate = CFStringIsSurrogateLowCharacter(char2) || CFStringIsSurrogateHighCharacter(char1); + Boolean nonAlphaNumeric1 = !CFCharacterSetIsCharacterMember(alphaNumericSet, char1); + Boolean nonAlphaNumeric2 = !CFCharacterSetIsCharacterMember(alphaNumericSet, char2); + Boolean whitespace1 = nonAlphaNumeric1 && CFCharacterSetIsCharacterMember(whiteSpaceSet, char1); + Boolean whitespace2 = nonAlphaNumeric2 && CFCharacterSetIsCharacterMember(whiteSpaceSet, char2); + Boolean lineBreak1 = whitespace1 && CFCharacterSetIsCharacterMember(controlSet, char1); + Boolean lineBreak2 = whitespace2 && CFCharacterSetIsCharacterMember(controlSet, char2); + Boolean blankLine1 = lineBreak1 && diff_regExMatch(one, &blankLineEndRegEx); + Boolean blankLine2 = lineBreak2 && diff_regExMatch(two, &blankLineStartRegEx); + if (blankLine1 || blankLine2) { // Five points for blank lines. return 5; @@ -688,7 +680,7 @@ CFIndex diff_cleanupSemanticScore(CFStringRef one, CFStringRef two) { } else if (whitespace1 || whitespace2) { // Two points for whitespace. return 2; - } else if (nonAlphaNumeric1 || nonAlphaNumeric2) { + } else if (nonAlphaNumeric1 && !char1IsSurrogate || nonAlphaNumeric2 && !char1IsSurrogate) { // One point for non-alphanumeric. return 1; } From 49684978411b81d45abd929de39f5cc9a23619f6 Mon Sep 17 00:00:00 2001 From: Jorge Leandro Perez Date: Fri, 8 Nov 2019 17:24:48 -0300 Subject: [PATCH 3/6] DiffMatchPatchCFUtilities: Reverting to impossible to read alignment --- .../DiffMatchPatchCFUtilities.c | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/External/diffmatchpatch/DiffMatchPatchCFUtilities.c b/External/diffmatchpatch/DiffMatchPatchCFUtilities.c index 4b468355..c16e58ce 100755 --- a/External/diffmatchpatch/DiffMatchPatchCFUtilities.c +++ b/External/diffmatchpatch/DiffMatchPatchCFUtilities.c @@ -655,18 +655,30 @@ CFIndex diff_cleanupSemanticScore(CFStringRef one, CFStringRef two) { // 'whitespace'. Since this function's purpose is largely cosmetic, // the choice has been made to use each language's native features // rather than force total conformity. - UniChar char1 = CFStringGetCharacterAtIndex(one, (CFStringGetLength(one) - 1)); - UniChar char2 = CFStringGetCharacterAtIndex(two, 0); - Boolean char1IsSurrogate = CFStringIsSurrogateLowCharacter(char1) || CFStringIsSurrogateHighCharacter(char1); - Boolean char2IsSurrogate = CFStringIsSurrogateLowCharacter(char2) || CFStringIsSurrogateHighCharacter(char1); - Boolean nonAlphaNumeric1 = !CFCharacterSetIsCharacterMember(alphaNumericSet, char1); - Boolean nonAlphaNumeric2 = !CFCharacterSetIsCharacterMember(alphaNumericSet, char2); - Boolean whitespace1 = nonAlphaNumeric1 && CFCharacterSetIsCharacterMember(whiteSpaceSet, char1); - Boolean whitespace2 = nonAlphaNumeric2 && CFCharacterSetIsCharacterMember(whiteSpaceSet, char2); - Boolean lineBreak1 = whitespace1 && CFCharacterSetIsCharacterMember(controlSet, char1); - Boolean lineBreak2 = whitespace2 && CFCharacterSetIsCharacterMember(controlSet, char2); - Boolean blankLine1 = lineBreak1 && diff_regExMatch(one, &blankLineEndRegEx); - Boolean blankLine2 = lineBreak2 && diff_regExMatch(two, &blankLineStartRegEx); + UniChar char1 = + CFStringGetCharacterAtIndex(one, (CFStringGetLength(one) - 1)); + UniChar char2 = + CFStringGetCharacterAtIndex(two, 0); + Boolean char1IsSurrogate = + CFStringIsSurrogateLowCharacter(char1) || CFStringIsSurrogateHighCharacter(char1); + Boolean char2IsSurrogate = + CFStringIsSurrogateLowCharacter(char2) || CFStringIsSurrogateHighCharacter(char1); + Boolean nonAlphaNumeric1 = + !CFCharacterSetIsCharacterMember(alphaNumericSet, char1); + Boolean nonAlphaNumeric2 = + !CFCharacterSetIsCharacterMember(alphaNumericSet, char2); + Boolean whitespace1 = + nonAlphaNumeric1 && CFCharacterSetIsCharacterMember(whiteSpaceSet, char1); + Boolean whitespace2 = + nonAlphaNumeric2 && CFCharacterSetIsCharacterMember(whiteSpaceSet, char2); + Boolean lineBreak1 = + whitespace1 && CFCharacterSetIsCharacterMember(controlSet, char1); + Boolean lineBreak2 = + whitespace2 && CFCharacterSetIsCharacterMember(controlSet, char2); + Boolean blankLine1 = + lineBreak1 && diff_regExMatch(one, &blankLineEndRegEx); + Boolean blankLine2 = + lineBreak2 && diff_regExMatch(two, &blankLineStartRegEx); if (blankLine1 || blankLine2) { // Five points for blank lines. From 6f32ca5ea0822811efef38f65ad025425bb8ba55 Mon Sep 17 00:00:00 2001 From: Jorge Leandro Perez Date: Fri, 8 Nov 2019 17:28:52 -0300 Subject: [PATCH 4/6] DiffMatchPatchCFUtilities: Fixes warning --- External/diffmatchpatch/DiffMatchPatchCFUtilities.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/External/diffmatchpatch/DiffMatchPatchCFUtilities.c b/External/diffmatchpatch/DiffMatchPatchCFUtilities.c index c16e58ce..d4515b72 100755 --- a/External/diffmatchpatch/DiffMatchPatchCFUtilities.c +++ b/External/diffmatchpatch/DiffMatchPatchCFUtilities.c @@ -692,7 +692,7 @@ CFIndex diff_cleanupSemanticScore(CFStringRef one, CFStringRef two) { } else if (whitespace1 || whitespace2) { // Two points for whitespace. return 2; - } else if (nonAlphaNumeric1 && !char1IsSurrogate || nonAlphaNumeric2 && !char1IsSurrogate) { + } else if ((nonAlphaNumeric1 && !char1IsSurrogate) || (nonAlphaNumeric2 && !char2IsSurrogate)) { // One point for non-alphanumeric. return 1; } From 8fa87b61446def0232c8d7e93b88f1cc10e5d3f3 Mon Sep 17 00:00:00 2001 From: Jorge Leandro Perez Date: Fri, 8 Nov 2019 17:35:41 -0300 Subject: [PATCH 5/6] DiffMatchPatchTest: New Unit Tests --- SimperiumTests/DiffMatchPatchTest.m | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/SimperiumTests/DiffMatchPatchTest.m b/SimperiumTests/DiffMatchPatchTest.m index fbc325c2..240ecb49 100755 --- a/SimperiumTests/DiffMatchPatchTest.m +++ b/SimperiumTests/DiffMatchPatchTest.m @@ -48,6 +48,17 @@ - (void)testDiffCommonPrefixTest { XCTAssertEqual((NSUInteger)4, [dmp diff_commonPrefixOfFirstString:@"1234" andSecondString:@"1234xyz"], @"Common suffix whole case failed."); } +- (void)testDiffCommonPrefixDoesntSplitSurrogatePairs { + DiffMatchPatch *dmp = [DiffMatchPatch new]; + + NSString *pristine = @"β˜ΊοΈπŸ––πŸΏ"; + NSString *edited = @"β˜ΊοΈπŸ˜ƒπŸ––πŸΏ"; + NSString *common = @"☺️"; + + NSUInteger prefix = [dmp diff_commonPrefixOfFirstString:pristine andSecondString:edited]; + XCTAssertEqual(prefix, common.length, @"Common Prefix should match the common emoji's length"); +} + - (void)testDiffCommonSuffixTest { DiffMatchPatch *dmp = [DiffMatchPatch new]; @@ -62,6 +73,19 @@ - (void)testDiffCommonSuffixTest { XCTAssertEqual((NSUInteger)4, [dmp diff_commonSuffixOfFirstString:@"1234" andSecondString:@"xyz1234"], @"Detect any common suffix. Whole case."); } +- (void)testDiffCommonSuffixDoesntSplitSurrogatePairs { + DiffMatchPatch *dmp = [DiffMatchPatch new]; + + NSString *pristine = @"β˜ΊοΈπŸ––πŸΏ"; + NSString *edited = @"β˜ΊοΈπŸ˜ƒπŸ––πŸΏ"; + NSString *expected = @"πŸ––πŸΏ"; + + NSUInteger suffix = [dmp diff_commonSuffixOfFirstString:pristine andSecondString:edited]; + NSString *common = [pristine substringFromIndex:pristine.length - suffix]; + + XCTAssertEqualObjects(expected, common, @"Common Suffix should match the last emoji"); +} + - (void)testDiffCommonOverlapTest { DiffMatchPatch *dmp = [DiffMatchPatch new]; From bc96dd3548abe42a04b02c6526e927201d6a5230 Mon Sep 17 00:00:00 2001 From: Jorge Leandro Perez Date: Fri, 8 Nov 2019 17:42:48 -0300 Subject: [PATCH 6/6] DiffMatchPatchTest: New Unit Test --- SimperiumTests/DiffMatchPatchTest.m | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/SimperiumTests/DiffMatchPatchTest.m b/SimperiumTests/DiffMatchPatchTest.m index 240ecb49..abf42a2f 100755 --- a/SimperiumTests/DiffMatchPatchTest.m +++ b/SimperiumTests/DiffMatchPatchTest.m @@ -412,6 +412,21 @@ - (void)testDiffCleanupSemanticLosslessTest { XCTAssertEqualObjects(expectedResult, diffs, @"Sentence boundaries."); } +- (void)testDiffCleanupSemanticLosslessDoesNotTamperWithSurrogatePairsTest { + NSArray *expected = @[ + [Diff diffWithOperation:DIFF_EQUAL andText:@"☺️"], + [Diff diffWithOperation:DIFF_INSERT andText:@"πŸ˜ƒ"], + [Diff diffWithOperation:DIFF_EQUAL andText:@"πŸ––πŸΏ"] + ]; + + NSMutableArray *diffs = [expected mutableCopy]; + + DiffMatchPatch *dmp = [DiffMatchPatch new]; + [dmp diff_cleanupSemanticLossless:diffs]; + + XCTAssertEqualObjects(diffs, expected, @"The result should match the input!"); +} + - (void)testDiffCleanupSemanticTest { DiffMatchPatch *dmp = [DiffMatchPatch new]; NSMutableArray *expectedResult = nil;