diff --git a/CHANGELOG b/CHANGELOG index 933804b64c..2846d0aaaf 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -16,6 +16,7 @@ git-annex (8.20211029) UNRELEASED; urgency=medium * migrate: New --remove-size option. * Fix a typo in the name of youtube-dl (reversion introduced in version 8.20210903) + * importfeed: Fix a crash when used in a non-unicode locale. -- Joey Hess Mon, 01 Nov 2021 13:19:46 -0400 diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs index a8d21365e3..922e354c21 100644 --- a/Command/ImportFeed.hs +++ b/Command/ImportFeed.hs @@ -20,7 +20,9 @@ import Data.Time.Format import Data.Time.Calendar import Data.Time.LocalTime import qualified Data.Text as T +import qualified Data.Text.Encoding as TE import qualified System.FilePath.ByteString as P +import qualified Data.ByteString as B import Command import qualified Annex @@ -161,10 +163,10 @@ findDownloads u f = catMaybes $ map mk (feedItems f) mk i = case getItemEnclosure i of Just (enclosureurl, _, _) -> Just $ ToDownload f u i $ Enclosure $ - T.unpack enclosureurl + decodeBS $ fromFeedText enclosureurl Nothing -> case getItemLink i of Just l -> Just $ ToDownload f u i $ - MediaLink $ T.unpack l + MediaLink $ decodeBS $ fromFeedText l Nothing -> Nothing {- Feeds change, so a feed download cannot be resumed. -} @@ -243,7 +245,7 @@ performDownload addunlockedmatcher opts cache todownload = case location todownl knownitemid = case getItemId (item todownload) of Just (_, itemid) -> - S.member (T.unpack itemid) (knownitems cache) + S.member (decodeBS $ fromFeedText itemid) (knownitems cache) _ -> False rundownload url extension getter = do @@ -370,7 +372,7 @@ feedFile tmpl i extension = sanitizeLeadingFilePathCharacter $ Just pd -> Just $ formatTime defaultTimeLocale "%F" pd -- if date cannot be parsed, use the raw string - Nothing-> replace "/" "-" . T.unpack + Nothing-> replace "/" "-" . decodeBS . fromFeedText <$> getItemPublishDateString itm (itempubyear, itempubmonth, itempubday) = case pubdate of @@ -401,7 +403,7 @@ minimalMetaData :: ToDownload -> MetaData minimalMetaData i = case getItemId (item i) of (Nothing) -> emptyMetaData (Just (_, itemid)) -> MetaData $ M.singleton itemIdField - (S.singleton $ toMetaValue $ encodeBS $ T.unpack itemid) + (S.singleton $ toMetaValue $fromFeedText itemid) {- Extract fields from the feed and item, that are both used as metadata, - and to generate the filename. -} @@ -411,18 +413,18 @@ extractFields i = map (uncurry extractField) , ("itemtitle", [itemtitle]) , ("feedauthor", [feedauthor]) , ("itemauthor", [itemauthor]) - , ("itemsummary", [T.unpack <$> getItemSummary (item i)]) - , ("itemdescription", [T.unpack <$> getItemDescription (item i)]) - , ("itemrights", [T.unpack <$> getItemRights (item i)]) - , ("itemid", [T.unpack . snd <$> getItemId (item i)]) + , ("itemsummary", [decodeBS . fromFeedText <$> getItemSummary (item i)]) + , ("itemdescription", [decodeBS . fromFeedText <$> getItemDescription (item i)]) + , ("itemrights", [decodeBS . fromFeedText <$> getItemRights (item i)]) + , ("itemid", [decodeBS . fromFeedText . snd <$> getItemId (item i)]) , ("title", [itemtitle, feedtitle]) , ("author", [itemauthor, feedauthor]) ] where - feedtitle = Just $ T.unpack $ getFeedTitle $ feed i - itemtitle = T.unpack <$> getItemTitle (item i) - feedauthor = T.unpack <$> getFeedAuthor (feed i) - itemauthor = T.unpack <$> getItemAuthor (item i) + feedtitle = Just $ decodeBS $ fromFeedText $ getFeedTitle $ feed i + itemtitle = decodeBS . fromFeedText <$> getItemTitle (item i) + feedauthor = decodeBS . fromFeedText <$> getFeedAuthor (feed i) + itemauthor = decodeBS . fromFeedText <$> getItemAuthor (item i) itemIdField :: MetaField itemIdField = mkMetaFieldUnchecked "itemid" @@ -478,3 +480,17 @@ clearFeedProblem url = feedState :: URLString -> Annex RawFilePath feedState url = fromRepo $ gitAnnexFeedState $ fromUrl url Nothing + +{- The feed library parses the feed to Text, and does not use the + - filesystem encoding to do it, so when the locale is not unicode + - capable, a Text value can still include unicode characters. + - + - So, it's not safe to use T.unpack to convert that to a String, + - because later use of that String by eg encodeBS will crash + - with an encoding error. Use this instad. + - + - This should not be used on a Text that is read using the + - filesystem encoding because it does not reverse that encoding. + -} +fromFeedText :: T.Text -> B.ByteString +fromFeedText = TE.encodeUtf8 diff --git a/doc/bugs/importfeed_failing_with_hPut__58___invalid_argument.mdwn b/doc/bugs/importfeed_failing_with_hPut__58___invalid_argument.mdwn index e7f7d445ee..007ad9b806 100644 --- a/doc/bugs/importfeed_failing_with_hPut__58___invalid_argument.mdwn +++ b/doc/bugs/importfeed_failing_with_hPut__58___invalid_argument.mdwn @@ -69,3 +69,5 @@ $ echo $? ### Have you had any luck using git-annex before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders) I've been using git annex for a few years now primarily as a podcatcher. I've also been using it to manage my ebooks and devices. I'm slowly starting to use it for managing my personal (non-text) records and documents. + +> [[fixed|done]] --[[Joey]] diff --git a/doc/bugs/importfeed_failing_with_hPut__58___invalid_argument/comment_1_a36feb24e627bba9c146fc791fca2a6b._comment b/doc/bugs/importfeed_failing_with_hPut__58___invalid_argument/comment_1_a36feb24e627bba9c146fc791fca2a6b._comment new file mode 100644 index 0000000000..673c8ec88e --- /dev/null +++ b/doc/bugs/importfeed_failing_with_hPut__58___invalid_argument/comment_1_a36feb24e627bba9c146fc791fca2a6b._comment @@ -0,0 +1,60 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2021-11-15T16:04:31Z" + content=""" +To reproduce this, I had to set LANG=C. Using a unicode locale avoids +the problem. + +The `.met` file indicates it's a problem with encoding of metadata +that is being imported from the feed, and so it must be the +itemid that is causing the problem. + + + ... + http://www.manager-tools.com/2014/01/choosing-a-company-work-chapter-7-–-questions/ + + +A file with the feed edited down to just that item is enough to reproduce it. +Notice the unicode in the guid "chapter-7-–-questions". +That ENDASH character is causing the crash. + +Also I noticed that the next time it runs, it skips the item, since it got +far enough to add the file for it and record the url before the metadata itemid +write crashed it. Explains why it's failing on different items in different runs. + +While this looks like one of the old Handle output encoding problems, it is not, +because a) the itemid is written as a ByteString so encoding does not matter, +b) those were fixed comprehensively by forcing all handles to use filesystem +enconding, and c) just printing out the length of the itemid also causes a crash: + + + liftIO $ print (L.length (journalableByteString content)) + + git-annex: recoverEncode: invalid argument (invalid character) + +Looking at what the feed library parses: + + LANG=C ghci Utility/FileSystemEncoding.hs + ghci> Just f <- Text.Feed.Import.parseFeedFromFile "career_tools_podcasts.xml" + ghci> Just (_, x) = Text.Feed.Query.getItemId (Text.Feed.Query.feedItems f !! 0) + ghci> x + "http://www.manager-tools.com/2014/01/choosing-a-company-work-chapter-7-\8211-questions/" + ghci> encodeBS (Data.Text.unpack x) + "*** Exception: recoverEncode: invalid argument (invalid character) + +So the problem is that Text parses the feed as unicode, leading to this +non-ascii Char that is not encoded using the filesystem encoding +(which would encode it as "\56546\56448\56467"). +And `encodeBS "\8211"` crashes in LANG=C. + +Which is a reversion of sorts; before [[!commit fa62c98910]] encodeBS did +not crash. Although it also didn't round-trip this value properly, +producing "M" for it. Since this only affects strings that are not input in +the filesystem encoding, I think the new encodeBS is still ok to use +generally; I'm not going to revert that commit. + +Instead, Text values originating from Feed need to be converted to +String in some other way, producing a value encoded +using the filesystem encoding. encodeUtf8 looks like it will +do the right thing in this case. +"""]]