@@ -519,6 +519,50 @@ TEST_F(LexerTest, RestoreStopAtCodeCompletion) {
519519 ASSERT_EQ (tok::eof, Tok.getKind ());
520520}
521521
522+ TEST_F (LexerTest, CharactersContainTheEdgeContinuationByte) {
523+ // A continuation byte must be in the range greater than or
524+ // equal to 0x80 and less than or equal to 0xBF
525+
526+ // À(0xC3 0x80), 㗀(0xE3 0x97 0x80), 🀀(0xF0 0x9F 0x80 0x80),
527+ // ÿ(0xC3 0xBF), 俿(0xE4 0xBF 0xBF), 𐐿(0xF0 0x90 0x90 0xBF)
528+ const char *Source = " À 㗀 🀀 ÿ 俿 𐐿" ;
529+
530+ LangOptions LangOpts;
531+ SourceManager SourceMgr;
532+ unsigned BufferID = SourceMgr.addMemBufferCopy (Source);
533+
534+ Lexer L (LangOpts, SourceMgr, BufferID, /* Diags=*/ nullptr , LexerMode::Swift);
535+
536+ Token Tok;
537+
538+ L.lex (Tok);
539+ ASSERT_EQ (tok::identifier, Tok.getKind ());
540+ ASSERT_EQ (" À" , Tok.getText ());
541+
542+ L.lex (Tok);
543+ ASSERT_EQ (tok::identifier, Tok.getKind ());
544+ ASSERT_EQ (" 㗀" , Tok.getText ());
545+
546+ L.lex (Tok);
547+ ASSERT_EQ (tok::identifier, Tok.getKind ());
548+ ASSERT_EQ (" 🀀" , Tok.getText ());
549+
550+ L.lex (Tok);
551+ ASSERT_EQ (tok::identifier, Tok.getKind ());
552+ ASSERT_EQ (" ÿ" , Tok.getText ());
553+
554+ L.lex (Tok);
555+ ASSERT_EQ (tok::identifier, Tok.getKind ());
556+ ASSERT_EQ (" 俿" , Tok.getText ());
557+
558+ L.lex (Tok);
559+ ASSERT_EQ (tok::identifier, Tok.getKind ());
560+ ASSERT_EQ (" 𐐿" , Tok.getText ());
561+
562+ L.lex (Tok);
563+ ASSERT_EQ (tok::eof, Tok.getKind ());
564+ }
565+
522566TEST_F (LexerTest, getLocForStartOfToken) {
523567 const char *Source = " aaa \n \t bbb \" hello\" \" -\\ (val)-\" " ;
524568
@@ -710,6 +754,29 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
710754 DiagConsumer.messages , " 1, 4: nul character embedded in middle of file" ));
711755}
712756
757+ TEST_F (LexerTest, InvalidUTF8Bytes) {
758+ const char *Source = " \x80 " ;
759+
760+ LangOptions LangOpts;
761+ SourceManager SourceMgr;
762+ unsigned BufferID = SourceMgr.addMemBufferCopy (Source);
763+
764+ StringCaptureDiagnosticConsumer DiagConsumer;
765+ DiagnosticEngine Diags (SourceMgr);
766+ Diags.addConsumer (DiagConsumer);
767+
768+ Lexer L (LangOpts, SourceMgr, BufferID, &Diags, LexerMode::Swift);
769+
770+ Token Tok;
771+
772+ L.lex (Tok);
773+
774+ ASSERT_EQ (DiagConsumer.messages .size (), 1 );
775+ auto message = DiagConsumer.messages .front ();
776+ ASSERT_TRUE (message.find (" invalid UTF-8 found in source file" ) !=
777+ std::string::npos);
778+ }
779+
713780#if HAS_MMAP
714781
715782// This test requires mmap because llvm::sys::Memory doesn't support protecting
0 commit comments