// uriparser.cpp
/* Copyright (C) 2019
 *
 * This file is part of the osdev components suite
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
 */
#include "uriparser.h"

// std
#include <algorithm>
#include <functional>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// gnu-c
#include <netdb.h>

// boost
#include <boost/algorithm/string/case_conv.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/regex.hpp>

// osdev::components::mqtt
#include "bimap.h"
#include "stringutils.h"
#include "uriutils.h"

// mlogic::common::logger
// #include "mlogic/common/logger/loggerprovider.h"
// #include "mlogic/common/invalidargumentexception.h"
// #include "mlogic/common/nullptrexception.h"
// #include "mlogic/common/systemexception.h"


using namespace osdev::components::mqtt;

namespace {

/**
 * @brief Inserts the value stored under itemName in one container into another container.
 * @tparam TFrom Source container type; must support const lookup through operator[] with a std::string key.
 * @tparam TTo Destination container type; must support insert() of a (name, value) pair.
 * @param itemName Key that is read from the source and used for the inserted entry.
 * @param from The container the value is read from.
 * @param to The container that receives the (itemName, value) pair.
 * @param transformation Optional function applied to the value before it is inserted.
 *        An empty std::function (the default) inserts the value unchanged.
 */
template <typename TFrom, typename TTo>
void copyItem(const std::string& itemName, const TFrom& from, TTo& to, const std::function<std::string(const std::string&)>& transformation = std::function<std::string(const std::string&)>())
{
    if (!transformation) {
        // No transformation supplied: store the value exactly as it was found.
        to.insert(std::make_pair(itemName, from[itemName]));
        return;
    }

    to.insert(std::make_pair(itemName, transformation(from[itemName])));
}

/**
 * @brief Looks up itemName in source without modifying the map.
 * @param source The map to search.
 * @param itemName The key to look for.
 * @return A reference to the stored value, or a reference to a shared empty string when the key is absent.
 */
const std::string& getItem(const std::map<std::string, std::string>& source, const std::string& itemName)
{
    static const std::string s_empty;
    const auto found = source.find(itemName);
    return (source.end() == found) ? s_empty : found->second;
}

/**
 * @brief Returns the bidirectional lookup table between a character and its percent-encoded string.
 *
 * The table is a function-local static, so it is built on the first call and the same
 * instance is returned afterwards. encode() searches the left (character) side and
 * decode() searches the right (string) side.
 *
 * Despite the name, the table holds more than the RFC 3986 reserved set: the second group
 * also lists '-', '.', '_' and '~', which RFC 3986 classifies as unreserved.
 * Characters that are not listed (for example the space character) are neither encoded
 * by encode() nor decoded by decode().
 *
 * NOTE(review): percentEncode<c>() is declared outside this file (presumably uriutils.h);
 * it is assumed to yield the three character "%XX" token for c - confirm, including the
 * case of the hex digits, since decode() does an exact string lookup.
 */
static const boost::bimap<char, std::string>& getReservedCharacterMap()
{
    static const auto s_lookupTable = makeBimap<char, std::string>(
        // RFC 3986 reserved characters: gen-delims followed by sub-delims.
        { { ':', percentEncode<':'>() },
            { '/', percentEncode<'/'>() },
            { '?', percentEncode<'?'>() },
            { '#', percentEncode<'#'>() },
            { '[', percentEncode<'['>() },
            { ']', percentEncode<']'>() },
            { '@', percentEncode<'@'>() },
            { '!', percentEncode<'!'>() },
            { '$', percentEncode<'$'>() },
            { '&', percentEncode<'&'>() },
            { '\'', percentEncode<'\''>() },
            { '(', percentEncode<'('>() },
            { ')', percentEncode<')'>() },
            { '*', percentEncode<'*'>() },
            { '+', percentEncode<'+'>() },
            { ',', percentEncode<','>() },
            { ';', percentEncode<';'>() },
            { '=', percentEncode<'='>() },

            // Additional characters that are encoded as well, including the percent sign itself.
            { '"', percentEncode<'"'>() },
            { '%', percentEncode<'%'>() },
            { '-', percentEncode<'-'>() },
            { '.', percentEncode<'.'>() },
            { '<', percentEncode<'<'>() },
            { '>', percentEncode<'>'>() },
            { '\\', percentEncode<'\\'>() },
            { '^', percentEncode<'^'>() },
            { '_', percentEncode<'_'>() },
            { '`', percentEncode<'`'>() },
            { '{', percentEncode<'{'>() },
            { '|', percentEncode<'|'>() },
            { '}', percentEncode<'}'>() },
            { '~', percentEncode<'~'>() } });

    return s_lookupTable;
}

/**
 * @brief Replaces every percent-encoded token that is present in the lookup table by its character.
 *
 * Tokens that are not in the table, as well as a truncated token at the end of the string,
 * are left untouched; validation of malformed input is currently disabled.
 *
 * @param in The string to decode.
 * @return A decoded copy of in.
 */
std::string decode(const std::string& in)
{
    static constexpr size_t encodingTokenSize = 3; // a token is '%' followed by two more characters
    const auto& table = getReservedCharacterMap();

    std::string result = in;
    // Resume the search one position past the previous '%'. That way a '%' produced by
    // decoding the percent sign itself is never interpreted a second time.
    for (std::size_t at = result.find('%'); at != std::string::npos; at = result.find('%', at + 1)) {
        const std::string token = result.substr(at, encodingTokenSize);
        const auto hit = table.right.find(token);
        if (hit == table.right.end()) {
            continue; // unknown or truncated token: keep the text as it is
        }
        // replace(pos, len, n, c): swap the token for n = 1 copy of the decoded character.
        result.replace(at, encodingTokenSize, 1, hit->second);
    }
    return result;
}

/**
 * @brief Replaces every character that is present in the lookup table by its percent-encoded token.
 * @param in The string to encode.
 * @return An encoded copy of in; characters that are not in the table are copied unchanged.
 */
std::string encode(const std::string& in)
{
    const auto& table = getReservedCharacterMap();

    std::string result = in;
    std::size_t at = 0;
    while (at < result.size()) {
        const auto hit = table.left.find(result[at]);
        if (hit != table.left.end()) {
            result.replace(at, 1, hit->second);
            // Step over the inserted token (three characters: '%' plus two more) so that
            // the '%' that was just written is not encoded again.
            at += 3;
        }
        else {
            ++at;
        }
    }
    return result;
}

} // anonymous


/**
 * @brief Splits a uri of the form scheme://authority[path][?query][#fragment] into its components.
 *
 * The result contains the keys "scheme", "user", "password", "ipv6", "host", "port", "path",
 * "query" and "fragment". Components that are absent from the uri are stored as empty strings.
 * Scheme, host and port are converted to lower case; user, password and fragment are
 * percent-decoded; path and query are stored as they appear in the uri.
 *
 * @param uri The uri to parse. Scheme and authority must both be present and non-empty.
 * @return The parsed components.
 * @throws std::invalid_argument when the uri as a whole, or its authority part, does not match
 *         the expected format.
 */
// static
ParsedUri UriParser::parse(const std::string& uri)
{
    // Before turning to regular expressions, the following libraries were evaluated to achieve this functionality:
    //  Qt QUrlParser: http://doc.qt.io/qt-4.8/qurl.html
    //  cpp-netlib: https://github.com/cpp-netlib/uri
    //  uriparser: http://uriparser.sourceforge.net/
    // Of these, cpp-netlib was the most compelling because of its pending standardization for C++.
    // However, none of these libraries can handle a service name in the port position, so a custom
    // implementation is used. As an additional validation step one of the above libraries could be
    // applied after the service name has been replaced by a port number.
    //
    // The uri is split in two stages. The first stage breaks the uri down into scheme (cannot be empty),
    // authority (cannot be empty), path, query and fragment. Path, query and fragment are optional.
    // Because scheme and authority cannot be empty, only a subset of all uris is handled by this function.
    // The second stage parses the authority to get the user, password, hostname and port. An ipv6 host
    // address must be wrapped in brackets. Addresses are not validated; when an opening bracket is found,
    // the part between the brackets must contain at least two colons. That is enough to discern something
    // that resembles an ipv6 address from the other possibilities.
    static const std::string regexUriString(
        R"regex(^(?<scheme>[^:/?#]+)://(?<authority>[^/?#]+)(?<path>[^?#]*)(?<q1>\?(?<query>[^#]*))?(?<f1>#(?<fragment>.*))?)regex");

    // This regex only has to exclude the @ symbol, because its input is the authority captured by the
    // uri regex, which already guarantees that the characters "/?#" do not occur in it. The if-then-else
    // construct "(?(?=regex)then|else)" chooses between a bracketed ipv6 host and any other host.
    static const std::string regexAuthorityString(
        R"regex(^((?<user>[^:@]+)(:(?<password>[^@]+))?@)?(?(?=(?<ipv6>\[))\[(?=(.*:){2,}.*)(?<host>[^@]+)\]|(?<host>[^:@]+))(?<p1>:(?<port>[^:@]+))?$)regex");

    // The contents of a match_results object are undefined after a failed regex_match, so both
    // failures are reported right away instead of reading the captures.
    static const boost::regex uriRegex(regexUriString);
    boost::cmatch whatUri;
    if (!boost::regex_match(uri.c_str(), whatUri, uriRegex)) {
        throw std::invalid_argument("No match for the specified uri.");
    }

    static const boost::regex authorityRegex(regexAuthorityString);
    // whatAuthority keeps pointers into this string, so it has to outlive the copyItem calls below.
    const std::string authority = whatUri["authority"];
    boost::cmatch whatAuthority;
    if (!boost::regex_match(authority.c_str(), whatAuthority, authorityRegex)) {
        throw std::invalid_argument("Uri contains invalid authority part.");
    }

    static const auto toLower = [](const std::string& in) -> std::string { return boost::to_lower_copy(in); };

    ParsedUri parsedUri;
    copyItem("scheme", whatUri, parsedUri, toLower);
    copyItem("user", whatAuthority, parsedUri, &decode);
    copyItem("password", whatAuthority, parsedUri, &decode);
    copyItem("ipv6", whatAuthority, parsedUri); // Acts as a flag. Empty means not ipv6, not empty means ipv6 (the address itself is not validated!)
    copyItem("host", whatAuthority, parsedUri, toLower);
    copyItem("port", whatAuthority, parsedUri, toLower);
    copyItem("path", whatUri, parsedUri);
    copyItem("query", whatUri, parsedUri);
    copyItem("fragment", whatUri, parsedUri, &decode);

    return parsedUri;
}

/**
 * @brief Splits the "query" component of a parsed uri into decoded key/value pairs.
 *
 * The query is split on '&'. Entries without an '=' are ignored. When a key occurs more
 * than once, the last value wins.
 *
 * @param parsedUri The parsed uri to take the query from.
 * @return The decoded key/value pairs; empty when parsedUri has no "query" entry.
 */
// static
ParsedQuery UriParser::parseQuery(const ParsedUri& parsedUri)
{
    const auto queryEntry = parsedUri.find("query");
    if (queryEntry == parsedUri.end()) {
        return {};
    }

    std::vector<std::string> entries;
    boost::algorithm::split(entries, queryEntry->second, [](char ch) { return '&' == ch; });

    std::map<std::string, std::string> result;
    for (const auto& entry : entries) {
        const auto separator = entry.find('=');
        if (std::string::npos == separator) {
            continue; // no '=' in this entry: nothing to store
        }
        const bool hasValue = separator + 1 < entry.size();
        result[decode(entry.substr(0, separator))] = hasValue ? decode(entry.substr(separator + 1)) : "";
    }
    return result;
}

/**
 * @brief Splits the "path" component of a parsed uri into decoded path elements.
 *
 * The first character of the path is dropped before splitting on '/'. A path produced by
 * parse() is either empty or starts with '/', so this removes the leading separator.
 *
 * @param parsedUri The parsed uri to take the path from.
 * @return The decoded path elements; empty when parsedUri has no "path" entry or the path is empty.
 */
// static
ParsedPath UriParser::parsePath(const ParsedUri& parsedUri)
{
    const auto pathEntry = parsedUri.find("path");
    if (pathEntry == parsedUri.end()) {
        return {};
    }

    ParsedPath elements;
    const auto& fullPath = pathEntry->second;
    if (fullPath.empty()) {
        return elements;
    }

    const auto withoutFirstCharacter = fullPath.substr(1);
    // Splitting an empty string still yields a single (empty) element.
    boost::algorithm::split(elements, withoutFirstCharacter, [](char ch) { return '/' == ch; });
    for (auto& element : elements) {
        element = decode(element);
    }
    return elements;
}

// static
std::string UriParser::normalize(const std::string& uri)
{
    auto parsedUri = parse(uri);

    auto& portString = parsedUri["port"];
    if (!portString.empty() && !is_numeric(portString)) {
        auto portnumber = getPortnumber(portString);
        portString = boost::lexical_cast<std::string>(portnumber);
    }

    return toString(parsedUri);
}

/**
 * @brief Builds a uri string from parsed components.
 *
 * User, password, path elements, query keys and values, and the fragment are percent-encoded.
 * Scheme, host and port are written as they are stored.
 *
 * @param parsedUri The components to assemble; missing entries are treated as empty.
 * @return The assembled uri.
 */
// static
std::string UriParser::toString(const ParsedUri& parsedUri)
{
    const auto& scheme = getItem(parsedUri, "scheme");
    const auto& host = getItem(parsedUri, "host");
    // TODO: an empty scheme or host is not rejected yet; the uri is assembled regardless.

    std::ostringstream out;
    out << scheme << "://";

    const auto& user = getItem(parsedUri, "user");
    if (!user.empty()) {
        out << encode(user);
        const auto& password = getItem(parsedUri, "password");
        if (!password.empty()) {
            out << ":" << encode(password);
        }
        out << "@";
    }

    // Wrap the hostname in brackets only if the incoming uri used them.
    const bool bracketed = !getItem(parsedUri, "ipv6").empty();
    out << (bracketed ? "[" : "") << host << (bracketed ? "]" : "");

    const auto& port = getItem(parsedUri, "port");
    if (!port.empty()) {
        out << ':' << port;
    }

    if (!getItem(parsedUri, "path").empty()) {
        for (const auto& element : UriParser::parsePath(parsedUri)) {
            out << "/" << encode(element);
        }
    }

    if (!getItem(parsedUri, "query").empty()) {
        const auto parameters = UriParser::parseQuery(parsedUri);
        out << '?';
        const char* separator = ""; // nothing in front of the first pair, '&' in front of all others
        for (const auto& parameter : parameters) {
            out << separator << encode(parameter.first) << '=' << encode(parameter.second);
            separator = "&";
        }
    }

    const auto& fragment = getItem(parsedUri, "fragment");
    if (!fragment.empty()) {
        out << '#' << encode(fragment);
    }
    return out.str();
}

/**
 * @brief Looks up the port number that belongs to a service name.
 *
 * The lookup is done with getservbyname_r(); the port it reports is in network byte order
 * and is converted to host byte order before it is returned.
 *
 * @param service The service name to look up.
 * @param protocolName The protocol name that is passed to getservbyname_r().
 * @return The port number in host byte order.
 * @throws std::invalid_argument when the lookup fails or no entry is found for the service
 *         and protocol.
 */
// static
int UriParser::getPortnumber(const std::string& service, const std::string& protocolName)
{
    constexpr std::size_t bufSize = 1024;
    struct servent data {};
    struct servent* result = nullptr;
    char buf[bufSize]; // contains the strings that data points to
    const int status = ::getservbyname_r(service.c_str(), protocolName.c_str(), &data, buf, bufSize, &result);
    // A non-zero status reports an error; a null result means that no entry was found.
    // In both cases data does not hold a usable port, so it must not be read.
    if (0 != status || nullptr == result) {
        throw std::invalid_argument("Could not determine the portnumber for the specified service and/or protocol.");
    }

// htonx functions need -Wold-style-cast disabled
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
    return static_cast<int>(ntohs(static_cast<uint16_t>(result->s_port)));
#pragma GCC diagnostic pop
}