diff --git a/Logs/Presence/Pure.hs b/Logs/Presence/Pure.hs index 877ef31449..8e5722fa42 100644 --- a/Logs/Presence/Pure.hs +++ b/Logs/Presence/Pure.hs @@ -19,6 +19,7 @@ import qualified Data.ByteString.Char8 as C8 import qualified Data.Attoparsec.ByteString.Lazy as A import Data.Attoparsec.ByteString.Char8 (char, anyChar) import Data.ByteString.Builder +import Data.Char newtype LogInfo = LogInfo { fromLogInfo :: S.ByteString } deriving (Show, Eq, Ord) @@ -119,8 +120,14 @@ instance Arbitrary LogLine where <*> elements [minBound..maxBound] <*> (LogInfo <$> arbinfo) where - arbinfo = (encodeBS <$> arbitrary) `suchThat` - (\b -> C8.notElem '\n' b && C8.notElem '\r' b) + -- Avoid newline characters, which cannot appear in + -- LogInfo. + -- + -- Avoid non-ascii values because fully arbitrary + -- strings may not be encoded using the filesystem + -- encoding, which is normally applied to all input. + arbinfo = (encodeBS <$> arbitrary `suchThat` all isAscii) + `suchThat` (\b -> C8.notElem '\n' b && C8.notElem '\r' b) prop_parse_build_presence_log :: [LogLine] -> Bool prop_parse_build_presence_log l = diff --git a/doc/bugs/prop__95__parse__95__build__95__presence__95__log_started_to_fail.mdwn b/doc/bugs/prop__95__parse__95__build__95__presence__95__log_started_to_fail.mdwn index 970eed7a70..3227f8dca7 100644 --- a/doc/bugs/prop__95__parse__95__build__95__presence__95__log_started_to_fail.mdwn +++ b/doc/bugs/prop__95__parse__95__build__95__presence__95__log_started_to_fail.mdwn @@ -18,3 +18,4 @@ cron-20210811/build-ubuntu.yaml-374-69466103-success/1_test-annex-more.txt:2021- cron-20210812/build-ubuntu.yaml-375-69466103-failed/1_test-annex-more.txt:2021-08-12T02:48:07.7317675Z name: git-annex-debianstandalone-packages_8.20210803+git45-g6318c0f27_amd64 ``` +> [[fixed|done]] --[[Joey]] diff --git a/doc/bugs/prop__95__parse__95__build__95__presence__95__log_started_to_fail/comment_1_32c4300e35453177613bfe1af50e6958._comment b/doc/bugs/prop__95__parse__95__build__95__presence__95__log_started_to_fail/comment_1_32c4300e35453177613bfe1af50e6958._comment new file mode 100644 index 0000000000..ec68f8fa97 --- /dev/null +++ b/doc/bugs/prop__95__parse__95__build__95__presence__95__log_started_to_fail/comment_1_32c4300e35453177613bfe1af50e6958._comment @@ -0,0 +1,46 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2021-08-18T20:38:06Z" + content=""" +The test suite output includes the random seed that allows reproducing +the failure. + + 2021-08-12T02:49:23.2932499Z prop_parse_build_presence_log: FAIL + 2021-08-12T02:49:23.2934007Z *** Failed! Exception: 'recoverEncode: invalid argument (invalid character)' (after 3 tests): + 2021-08-12T02:49:23.2935639Z Exception thrown while showing test case: 'recoverEncode: invalid argument (invalid character)' + 2021-08-12T02:49:23.2937094Z Use --quickcheck-replay=271417 to reproduce. + +And git-annex test has to be run with LANG=C for it to fail. + +[[!commit fa62c98910746c2c5dda21b3f80effc147a04f65]] is responsible for this +failure popping up, somehow. I tried reverting that commit, and it fixed +the failure. + +But I don't understand why that commit would cause this problem. +Also reverting it is not sufficient, because filepath-bytestring includes +the same change, in its commit 7e88eb5726d8183987455e15d921dd4c5df94674. +So if the new code is buggy, RawFilePath conversions could also trigger +similar problems. Also, these changes were a 2x speedup over the old code. + +A similar problem was discussed and fixed long ago +in [[!commit 4e4e11849a0d95389de81461ba2f2a4e0245d3b2]]: + + an Arbitrary String is not necessarily encoded using the filesystem + encoding, and in a non-utf8 locale, encodeBS throws an exception on such a + string. All I could think to do is limit test data to ascii. + + This shouldn't be a problem in practice, because the all Strings in + git-annex that are not generated by Arbitrary should be loaded in a way + that does apply the filesystem encoding. + +That makes sense. And it is also encodeBS failing here, in a similar +situation. But why would the recent change to the implementation of +encodeBS make it fail, in this case where the old implementation does not? + +So I've fixed this by similarly limiting the test to ascii, but I'm unsatisfied +that the encodeBS implementation change might not break something else. I did +try, with LANG=C, using git-annex in a repo with some filenames that were +encoded in unicode, and it round-tripped them fine still, so the change seems +ok to that extent. +"""]]