@@ -44,6 +44,18 @@ module Language.LSP.VFS
4444 , persistFileVFS
4545 , closeVFS
4646
47+ -- * Positions and transformations
48+ , CodePointPosition (.. )
49+ , line
50+ , character
51+ , codePointPositionToPosition
52+ , positionToCodePointPosition
53+ , CodePointRange (.. )
54+ , start
55+ , end
56+ , codePointRangeToRange
57+ , rangeToCodePointRange
58+
4759 -- * manipulating the file contents
4860 , rangeLinesFromVfs
4961 , PosPrefixInfo (.. )
@@ -69,9 +81,10 @@ import Data.Ord
6981import qualified Data.HashMap.Strict as HashMap
7082import qualified Data.Map.Strict as Map
7183import Data.Maybe
84+ import qualified Data.Text.Rope as URope
7285import Data.Text.Utf16.Rope ( Rope )
7386import qualified Data.Text.Utf16.Rope as Rope
74- import Data.Text.Prettyprint.Doc
87+ import Data.Text.Prettyprint.Doc hiding ( line )
7588import qualified Language.LSP.Types as J
7689import qualified Language.LSP.Types.Lens as J
7790import System.FilePath
@@ -346,6 +359,133 @@ changeChars logger str start finish new = do
346359
347360-- ---------------------------------------------------------------------
348361
362+ -- | A position, like a 'J.Position', but where the offsets in the line are measured in
363+ -- Unicode code points instead of UTF-16 code units.
364+ data CodePointPosition =
365+ CodePointPosition
366+ { -- | Line position in a document (zero-based).
367+ _line :: J. UInt
368+ -- | Character offset on a line in a document in *code points* (zero-based).
369+ , _character :: J. UInt
370+ } deriving (Show , Read , Eq , Ord )
371+
372+ -- | A range, like a 'J.Range', but where the offsets in the line are measured in
373+ -- Unicode code points instead of UTF-16 code units.
374+ data CodePointRange =
375+ CodePointRange
376+ { _start :: CodePointPosition -- ^ The range's start position.
377+ , _end :: CodePointPosition -- ^ The range's end position.
378+ } deriving (Show , Read , Eq , Ord )
379+
380+ makeFieldsNoPrefix ''CodePointPosition
381+ makeFieldsNoPrefix ''CodePointRange
382+
383+ {- Note [Converting between code points and code units]
384+ This is inherently a somewhat expensive operation, but we take some care to minimize the cost.
385+ In particular, we use the good asymptotics of 'Rope' to our advantage:
386+ - We extract the single line that we are interested in in time logarithmic in the number of lines.
387+ - We then split the line at the given position, and check how long the prefix is, which takes
388+ linear time in the length of the (single) line.
389+
390+ We also may need to convert the line back and forth between ropes with different indexing. Again
391+ this is linear time in the length of the line.
392+
393+ So the overall process is logarithmic in the number of lines, and linear in the length of the specific
394+ line. Which is okay-ish, so long as we don't have very long lines.
395+ -}
396+
397+ -- | Extracts a specific line from a 'Rope.Rope'.
398+ -- Logarithmic in the number of lines.
399+ extractLine :: Rope. Rope -> Word -> Maybe Rope. Rope
400+ extractLine rope l = do
401+ -- Check for the line being out of bounds
402+ let lastLine = Rope. posLine $ Rope. lengthAsPosition rope
403+ guard $ l <= lastLine
404+
405+ let (_, suffix) = Rope. splitAtLine l rope
406+ (prefix, _) = Rope. splitAtLine 1 suffix
407+ pure prefix
408+
409+ -- | Translate a code-point offset into a code-unit offset.
410+ -- Linear in the length of the rope.
411+ codePointOffsetToCodeUnitOffset :: URope. Rope -> Word -> Maybe Word
412+ codePointOffsetToCodeUnitOffset rope offset = do
413+ -- Check for the position being out of bounds
414+ guard $ offset <= URope. length rope
415+ -- Split at the given position in *code points*
416+ let (prefix, _) = URope. splitAt offset rope
417+ -- Convert the prefix to a rope using *code units*
418+ utf16Prefix = Rope. fromText $ URope. toText prefix
419+ -- Get the length of the prefix in *code units*
420+ pure $ Rope. length utf16Prefix
421+
422+ -- | Translate a UTF-16 code-unit offset into a code-point offset.
423+ -- Linear in the length of the rope.
424+ codeUnitOffsetToCodePointOffset :: Rope. Rope -> Word -> Maybe Word
425+ codeUnitOffsetToCodePointOffset rope offset = do
426+ -- Check for the position being out of bounds
427+ guard $ offset <= Rope. length rope
428+ -- Split at the given position in *code units*
429+ (prefix, _) <- Rope. splitAt offset rope
430+ -- Convert the prefixto a rope using *code points*
431+ let utfPrefix = URope. fromText $ Rope. toText prefix
432+ -- Get the length of the prefix in *code points*
433+ pure $ URope. length utfPrefix
434+
435+ -- | Given a virtual file, translate a 'CodePointPosition' in that file into a 'J.Position' in that file.
436+ --
437+ -- Will return 'Nothing' if the requested position is out of bounds of the document.
438+ --
439+ -- Logarithmic in the number of lines in the document, and linear in the length of the line containing
440+ -- the position.
441+ codePointPositionToPosition :: VirtualFile -> CodePointPosition -> Maybe J. Position
442+ codePointPositionToPosition vFile (CodePointPosition l cpc) = do
443+ -- See Note [Converting between code points and code units]
444+ let text = _file_text vFile
445+ utf16Line <- extractLine text (fromIntegral l)
446+ -- Convert the line a rope using *code points*
447+ let utfLine = URope. fromText $ Rope. toText utf16Line
448+
449+ cuc <- codePointOffsetToCodeUnitOffset utfLine (fromIntegral cpc)
450+ pure $ J. Position l (fromIntegral cuc)
451+
452+ -- | Given a virtual file, translate a 'CodePointRange' in that file into a 'J.Range' in that file.
453+ --
454+ -- Will return 'Nothing' if any of the positions are out of bounds of the document.
455+ --
456+ -- Logarithmic in the number of lines in the document, and linear in the length of the lines containing
457+ -- the positions.
458+ codePointRangeToRange :: VirtualFile -> CodePointRange -> Maybe J. Range
459+ codePointRangeToRange vFile (CodePointRange b e) =
460+ J. Range <$> codePointPositionToPosition vFile b <*> codePointPositionToPosition vFile e
461+
462+ -- | Given a virtual file, translate a 'J.Position' in that file into a 'CodePointPosition' in that file.
463+ --
464+ -- Will return 'Nothing' if the requested position lies inside a code point, or if it is out of bounds of the document.
465+ --
466+ -- Logarithmic in the number of lines in the document, and linear in the length of the line containing
467+ -- the position.
468+ positionToCodePointPosition :: VirtualFile -> J. Position -> Maybe CodePointPosition
469+ positionToCodePointPosition vFile (J. Position l cuc) = do
470+ -- See Note [Converting between code points and code units]
471+ let text = _file_text vFile
472+ utf16Line <- extractLine text (fromIntegral l)
473+
474+ cpc <- codeUnitOffsetToCodePointOffset utf16Line (fromIntegral cuc)
475+ pure $ CodePointPosition l (fromIntegral cpc)
476+
477+ -- | Given a virtual file, translate a 'J.Range' in that file into a 'CodePointRange' in that file.
478+ --
479+ -- Will return 'Nothing' if any of the positions are out of bounds of the document.
480+ --
481+ -- Logarithmic in the number of lines in the document, and linear in the length of the lines containing
482+ -- the positions.
483+ rangeToCodePointRange :: VirtualFile -> J. Range -> Maybe CodePointRange
484+ rangeToCodePointRange vFile (J. Range b e) =
485+ CodePointRange <$> positionToCodePointPosition vFile b <*> positionToCodePointPosition vFile e
486+
487+ -- ---------------------------------------------------------------------
488+
349489-- TODO:AZ:move this to somewhere sane
350490-- | Describes the line at the current cursor position
351491data PosPrefixInfo = PosPrefixInfo
0 commit comments