An introduction to regular expressions in modern C++

Hi, Habr. We invite future students of the "C++ Developer. Professional" course to sign up for an open lesson on the topic "Backend in modern C++".







In the meantime, let's share the traditional translation of useful material.










(Regular expressions , , regex — ) — C++. . - , — 20-30% . , ( “learn once, write anywhere”).





/!\: . , .





. , , . , . , C++. , .





: C++ ("flavours") , (, , ) ECMAScript.





, . , 24- (.. :), .





\b([01]?[0-9]|2[0-3]):([0-5]\d)\b
      
      



! ?





, , , 100% . . , , .





(↓), , 2-3 , , , . , , , .





, . . . (, ) , , .





https://regexone.com. . , , (<= , ) . .





, , :





  1. regextutorials.com





  2. hackerrank





std::regex and std::regex_error

// Demonstrates that constructing a std::regex from an ill-formed pattern
// throws std::regex_error carrying a machine-readable error code.
int main() {
    try {
        // A lone backslash is an incomplete escape sequence.
        static const auto r = std::regex(R"(\)");
    } catch (const std::regex_error &e) {
        // Only the error code is portable. The text returned by e.what() is
        // implementation-defined, so comparing it against a fixed string
        // ties the example to one particular standard library build.
        assert(e.code() == std::regex_constants::error_escape);
    }
    return EXIT_SUCCESS;
}
      
      



! . , (\) escape-.





std::regex



( ), ( allocator aware). , std::regex



(. C++ Weekly - Ep 74 - std::regex optimize by Jason Turner). , -, , , — std::regex::mark_count()



, .





, , (, std::regex_error)



, .





std::regex_search

// Locates the first "<word>:<word>;" occurrence in the input and checks every
// piece of information the resulting match object exposes.
int main() {
    const regex pattern(R"((\w+):(\w+);)");
    const string text = "ABC:1->   PQR:2;;;   XYZ:3<<<"s;

    smatch found;
    if (!regex_search(text, found, pattern)) {
        return EXIT_SUCCESS;
    }

    // Index 0 is the whole match; 1 and 2 are the capture groups.
    assert(found.size() == 3);
    assert(found.str(0) == "PQR:2;");
    assert(found.str(1) == "PQR");
    assert(found.str(2) == "2");

    // Text before the first matched character / after the last one.
    assert(found.prefix().str() == "ABC:1->   ");
    assert(found.suffix().str() == ";;   XYZ:3<<<");

    // The sub-matches can also be iterated directly:
    // for (string &&piece : found) {
    //     cout << piece << endl;
    // }
    return EXIT_SUCCESS;
}
      
      



smatch



std::match_results, ().





std::regex_match

, , - . std::regex_match



.





// Returns true when the whole of `str` has the shape "<word>@<word>.com" or
// "<word>@<word>.in". This is a deliberately simplistic check, not a full
// e-mail address validator.
bool is_valid_email_id(string_view str) {
    // Compiled once; building a regex is far more expensive than matching.
    static const regex r(R"(\w+@\w+\.(?:com|in))");
    // Match over the view's own range: string_view::data() is not guaranteed
    // to be null-terminated, so treating it as a C string could read past the
    // end of the view (or include characters that are not part of it).
    return regex_match(str.begin(), str.end(), r);
}

// Exercises the e-mail check with one accepted and one rejected input.
int main() {
    // Well-formed address.
    assert(is_valid_email_id("vishalchovatiya@ymail.com"));
    // Nothing before the '@', so the leading \w+ cannot match.
    assert(!is_valid_email_id("@abc.com"));
    return EXIT_SUCCESS;
}
      
      



std::regex_match



, std::regex_match



() .





— (static const regex), («/») .





, 30 -O3. . , ISO C++. , . ( ).





std::regex_match vs. std::regex_search

, , ? . , cppreference, . , (, StackOverflow):





// Contrasts regex_match (the whole input must match) with regex_search (any
// substring may match) on the same input and pattern.
int main() {
    const regex pattern(R"((\w+):(\w+);)");
    const string text = "ABC:1->   PQR:2;;;   XYZ:3<<<"s;
    smatch found;

    // The pattern covers only a part of the input, so a full match fails.
    assert(!regex_match(text, found, pattern));

    // A search succeeds and fills in the match object.
    assert(regex_search(text, found, pattern));
    assert(found.ready());
    assert(found[1] == "PQR");

    return EXIT_SUCCESS;
}
      
      



std::regex_match



true



, , std::regex_search



true



, .





std::regex_iterator

std::regex_iterator



, .





// Expands to the (cbegin, cend) iterator pair of a container.
#define C_ALL(X) cbegin(X), cend(X)

// Collects every "<word>:<digit>" match of the input into a vector and checks
// the whole match and both capture groups of each of the three results.
int main() {
    const regex pair_re(R"((\w+):(\d))");
    const string text = "ABC:1->   PQR:2;;;   XYZ:3<<<"s;

    // A default-constructed sregex_iterator is the end-of-sequence marker.
    const vector<smatch> found{
        sregex_iterator{C_ALL(text), pair_re},
        sregex_iterator{}
    };

    // Row i lists: whole match, 1st group, 2nd group of match i.
    const char *const expected[3][3] = {
        {"ABC:1", "ABC", "1"},
        {"PQR:2", "PQR", "2"},
        {"XYZ:3", "XYZ", "3"},
    };

    for (size_t hit = 0; hit < 3; ++hit) {
        for (size_t group = 0; group < 3; ++group) {
            assert(found[hit].str(group) == expected[hit][group]);
        }
    }

    return EXIT_SUCCESS;
}
      
      



( C++11) , , std::regex_iterator



. C++14.





std::regex_token_iterator

std::regex_token_iterator



— , 80% . std::regex_iterator



. std::regex_iterator



std::regex_token_iterator



,





  • std::regex_iterator



    .





  • std::regex_token_iterator



    .





std::regex_token_iterator



.





// Expands to the (cbegin, cend) iterator pair of a container.
#define C_ALL(X) cbegin(X), cend(X)

// Uses sregex_token_iterator to pull out, for every "<word>:<digit>" match,
// either the whole match (index 0) or one specific capture group (1 or 2).
int main() {
    const regex pair_re(R"((\w+):(\d))");
    const string text = "ABC:1->   PQR:2;;;   XYZ:3<<<"s;

    // Gathers the selected sub-match of every match as plain strings
    // (vector<string>, not vector<smatch> as with sregex_iterator).
    const auto tokens_for = [&](int submatch) {
        return vector<string>{
            sregex_token_iterator{C_ALL(text), pair_re, submatch},
            sregex_token_iterator{}
        };
    };

    // 0 selects the whole regex match.
    assert((tokens_for(0) == vector<string>{"ABC:1", "PQR:2", "XYZ:3"}));

    // 1 selects the first capture group.
    assert((tokens_for(1) == vector<string>{"ABC", "PQR", "XYZ"}));

    // 2 selects the second capture group.
    assert((tokens_for(2) == vector<string>{"1", "2", "3"}));

    return EXIT_SUCCESS;
}
      
      



std::regex_token_iterator

// Expands to the (cbegin, cend) iterator pair of a container.
#define C_ALL(X) cbegin(X), cend(X)

// Shows the "inverted" mode of sregex_token_iterator: sub-match index -1
// yields the stretches of input that lie between the matches.
int main() {
    const regex pair_re(R"((\w+):(\d))");
    const string text = "ABC:1->   PQR:2;;;   XYZ:3<<<"s;

    const sregex_token_iterator unmatched_begin{C_ALL(text), pair_re, -1};
    const sregex_token_iterator unmatched_end{};
    const vector<string> between(unmatched_begin, unmatched_end);

    // The first piece is empty because the input starts with a match.
    const vector<string> expected{"", "->   ", ";;;   ", "<<<"};
    assert(between == expected);

    return EXIT_SUCCESS;
}
      
      



std::regex_replace

// Replaces every "<word>:<digit>" pair found in `text` with its digit alone.
//
// text: input to transform; only the characters inside the view are used.
// f:    extra formatting flags forwarded to regex_replace (for example
//       regex_constants::format_no_copy to drop the unmatched parts).
// Returns the transformed copy of `text`.
string transform_pair(string_view text, regex_constants::match_flag_type f = {}) {
    // Compiled once and reused across calls.
    static const auto r = regex(R"((\w+):(\d))");
    // Build a string from the view instead of passing text.data():
    // string_view::data() is not guaranteed to be null-terminated, so using
    // it as a C string could run past the end of the view.
    return regex_replace(string(text), r, "$2", f);
}

// Runs transform_pair with the default flags and with format_no_copy.
int main() {
    const auto pairs = "ABC:1, PQR:2"s;

    // Default flags: text outside the matches is copied through unchanged.
    assert(transform_pair(pairs) == "1, 2"s);

    // format_no_copy: only the replacement text is emitted.
    assert(transform_pair(pairs, regex_constants::format_no_copy) == "12"s);

    return EXIT_SUCCESS;
}
      
      







, transform_pair



std::regex_constants::format_no_copy



, , . std::regex_constants .





, , . , , -, , , . , ! std::regex_replace



:





// Streams the input to standard output with each separator character
// ('-', '>', '<', ';' or a space) replaced by a single space.
int main() {
    const regex separators(R"(-|>|<|;| )");
    const string text = "ABC:1->   PQR:2;;;   XYZ:3<<<"s;

    // Write straight into cout's buffer; no intermediate string is built.
    ostreambuf_iterator<char> sink(cout);

    // Prints "ABC:1     PQR:2      XYZ:3   "
    regex_replace(sink, C_ALL(text), separators, " ");

    return EXIT_SUCCESS;
}
      
      



(delimiter)

std::strtok



, :





// Expands to the (cbegin, cend) iterator pair of a container.
#define C_ALL(X) cbegin(X), cend(X)

// Splits `str` at every match of the regular expression `pattern`.
//
// str:     text to split.
// pattern: regular expression describing the delimiter; only the characters
//          inside the view are used.
// Returns the pieces between delimiters, in order. A delimiter at the very
// start of `str` produces a leading empty piece.
vector<string> split(const string& str, string_view pattern) {
    // Pass the length explicitly: string_view::data() is not guaranteed to be
    // null-terminated, so the single-pointer constructor could read past the
    // end of the view.
    const auto r = regex(pattern.data(), pattern.size());
    return vector<string>{
        sregex_token_iterator(C_ALL(str), r, -1), // -1 selects the unmatched parts
        sregex_token_iterator()
    };
}

// Splits a path on '/' and checks the resulting components.
int main() {
    // The leading '/' yields an empty first component.
    const vector<string> expected{"", "root", "home", "vishal"};
    assert((split("/root/home/vishal", "/") == expected));
    return EXIT_SUCCESS;
}
      
      



// Returns a copy of `text` with every run of whitespace removed. Note that
// this strips interior whitespace as well, not just the leading and trailing
// whitespace.
string trim(string_view text) {
    // Compiled once and reused across calls.
    static const auto r = regex(R"(\s+)");
    // Build a string from the view instead of passing text.data():
    // string_view::data() is not guaranteed to be null-terminated, so using
    // it as a C string could run past the end of the view.
    return regex_replace(string(text), r, "");
}

// Checks that trim removes every run of whitespace from its input.
int main() {
    const auto spaced = "12   3 4      5"s;
    const auto compact = "12345"s;
    assert(trim(spaced) == compact);
    return EXIT_SUCCESS;
}
      
      



, ,

// Concatenates `words`, inserting `delimiter` between consecutive elements.
//
// words:     strings to join; may be empty.
// delimiter: text placed between two neighbouring words.
// Returns the joined string, or an empty string when `words` is empty.
string join(const vector<string>& words, const string& delimiter) {
    string joined;
    for (auto it = words.begin(); it != words.end(); ++it) {
        // No delimiter before the first word. Looping from begin() also
        // keeps an empty input well-defined (the result is simply "").
        if (it != words.begin()) {
            joined += delimiter;
        }
        joined += *it;
    }
    return joined;
}

// Returns the lines of the file at path `file` that contain at least one of
// `words` as a whole word.
//
// file:  path of the text file to read, line by line.
// words: alternatives to look for. They are inserted into the pattern
//        verbatim, so regex metacharacters in a word are NOT escaped.
// Returns the matching lines in file order; empty when the file cannot be
// opened or no line matches.
vector<string> lines_containing(const string& file, const vector<string>& words) {
    auto prefix = "^.*?\\b("s;
    auto suffix = ")\\b.*$"s;

    //  ^.*?\b(one|two|three)\b.*$
    // Compile the pattern once, up front, instead of once per line read.
    const regex matcher(move(prefix) + join(words, "|") + move(suffix));

    ifstream        infile(file);
    vector<string>  result;

    for (string line; getline(infile, line);) {
        if (regex_match(line, matcher)) {
            // `line` is overwritten by the next getline, so moving is safe.
            result.emplace_back(move(line));
        }
    }

    return result;
}

// Looks up "one" and "two" in test.txt (contents listed below) and checks
// that exactly the two matching lines come back.
int main() {
    const vector<string> expected{"This is one", "This is two"};
    assert((lines_containing("test.txt", {"one", "two"}) == expected));
    return EXIT_SUCCESS;
}
/* test.txt
This is one
This is two
This is three
This is four
*/
      
      



, ^((?!(one|two|three)).)*$



.





namespace fs = std::filesystem;

// Recursively collects the regular files under `path` whose file name (last
// path component) matches the regular expression `rg` in full.
//
// path: directory to walk.
// rg:   regular expression for the file name; only the characters inside the
//       view are used.
// Returns the matching directory entries in traversal order.
vector<fs::directory_entry> find_files(const fs::path &path, string_view rg) {
    vector<fs::directory_entry> result;
    // Pass the length explicitly: string_view::data() is not guaranteed to be
    // null-terminated, so the single-pointer constructor could read past the
    // end of the view.
    const regex r(rg.data(), rg.size());
    copy_if(
        fs::recursive_directory_iterator(path),
        fs::recursive_directory_iterator(),
        back_inserter(result),
        [&r](const fs::directory_entry &entry) {
            return fs::is_regular_file(entry.path()) &&
                   regex_match(entry.path().filename().string(), r);
        });
    return result;
}

// Lists every "<word>.png" file found under the current working directory,
// one path per line.
int main() {
    // The original also computed fs::temp_directory_path() into a local that
    // was never used; the search has always run on the current directory.
    const auto pattern    = R"(\w+\.png)";
    const auto result     = find_files(fs::current_path(), pattern);
    for (const auto &entry : result) {
        cout << entry.path().string() << endl;
    }
    return EXIT_SUCCESS;
}
      
      



  • C++ .





  • , https://regex101.com. regex101, ( ).





  • , , , .





    :





    • (alternation), , com|net|org



      .





    • .





    • .





    • .





    • , .





, C++ . IDE ( vscode ) Linux. , . ( , ). , , .





, ; , .





— ( 19 2020 ) . Boost, CTRE Std. , . , CppCon 2018 2019 , .






"C++ Developer. Professional"









"Backend C++"












All Articles