add PIRegularExpression

2025-08-11 14:23:29 +03:00
parent 91955d44fa
commit 654c0847b2
481 changed files with 434858 additions and 0 deletions
@@ -0,0 +1,163 @@
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.
+
+#subject dfa
+
+# ---------------------------------------------------- 
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
+/X/utf
+    XX\x{d800}
+Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
+    XX\x{d800}\=offset=3
+No match
+    XX\x{d800}\=no_utf_check
+ 0: X
+    XX\x{da00}
+Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
+    XX\x{da00}\=no_utf_check
+ 0: X
+    XX\x{dc00}
+Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
+    XX\x{dc00}\=no_utf_check
+ 0: X
+    XX\x{de00}
+Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
+    XX\x{de00}\=no_utf_check
+ 0: X
+    XX\x{dfff}
+Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
+    XX\x{dfff}\=no_utf_check
+ 0: X
+    XX\x{110000}
+** Failed: character \N{U+110000} is greater than 0x10ffff and therefore cannot be encoded as UTF-16
+    XX\x{d800}\x{1234}
+Failed: error -25: UTF-16 error: invalid low surrogate at offset 2
+          
+/badutf/utf
+    X\xdf
+No match
+    XX\xef
+No match
+    XXX\xef\x80
+No match
+    X\xf7
+No match
+    XX\xf7\x80
+No match
+    XXX\xf7\x80\x80
+No match
+
+/shortutf/utf
+    XX\xdf\=ph
+No match
+    XX\xef\=ph
+No match
+    XX\xef\x80\=ph
+No match
+    \xf7\=ph
+No match
+    \xf7\x80\=ph
+No match
+    
+# ---------------------------------------------------- 
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+No match
+
+/[^\x{120}]/i
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+No match
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+No match
+
+# ---------------------------------------------------- 
+
+# ---------------------------------------------------- 
+# Tests for handling 0xffffffff in caseless UCP mode. They only apply to 32-bit
+# mode; for the other widths they will fail.
+
+/k*\x{ffffffff}/caseless,ucp
+Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
+    \x{ffffffff}
+
+/k+\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
+    K\x{ffffffff}
+\= Expect no match     
+    \x{ffffffff}\x{ffffffff}
+
+/k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 15: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k\x{ffffffff}/caseless,ucp,no_start_optimize
+Failed: error 134 at offset 12: character code point value in \x{} or \o{} is too large
+    K\x{ffffffff}
+\= Expect no match
+    \x{ffffffff}\x{ffffffff}\x{ffffffff}
+
+/k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
+\= Expect no match
+    Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
+** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+No match
+
+# ---------------------------------------------------- 
+
+# End of testinput14