diff --git a/cpp/src/mlt/decode/string.hpp b/cpp/src/mlt/decode/string.hpp
index 0996cd0ea..14322e500 100644
--- a/cpp/src/mlt/decode/string.hpp
+++ b/cpp/src/mlt/decode/string.hpp
@@ -64,7 +64,9 @@ class StringDecoder {
                         throw std::runtime_error("Data stream missing logical type");
                     }
                     dictType = streamMetadata->getLogicalStreamType()->getDictionaryType();
+                    /// can we only get 2 dictionary types in here?
                     auto& target = (dictType == DictionaryType::SINGLE) ? dictionaryStream : symbolStream;
+                    /// DictionaryType::FSST is not used?
                     decodeRaw(tileData, target, streamMetadata->getByteLength(), /*consume=*/true);
                     break;
                 }
@@ -209,53 +211,6 @@ class StringDecoder {
                           decompressedLength);
     }
 
-    static std::vector decodeFSST(const std::uint8_t* symbols,
-                                  const std::size_t symbolCount,
-                                  const std::uint32_t* symbolLengths,
-                                  const std::size_t symbolLengthCount,
-                                  const std::uint8_t* compressedData,
-                                  const std::size_t compressedDataCount,
-                                  const std::size_t decompressedLength) {
-        std::vector output;
-
-        if (decompressedLength > 0) {
-            output.resize(decompressedLength);
-        }
-        std::vector symbolOffsets(symbolLengthCount);
-        for (size_t i = 1; i < symbolLengthCount; i++) {
-            symbolOffsets[i] = symbolOffsets[i - 1] + symbolLengths[i - 1];
-        }
-
-        std::size_t idx = 0;
-        for (size_t i = 0; i < compressedDataCount; i++) {
-            const std::uint8_t symbolIndex = compressedData[i];
-
-            // 255 is our escape byte -> take the next symbol as it is
-            if (symbolIndex == 255) {
-                if (idx == output.size()) {
-                    output.resize(output.size() * 2);
-                }
-                output[idx++] = compressedData[++i];
-            } else if (symbolIndex < symbolLengthCount) {
-                const auto len = symbolLengths[symbolIndex];
-                if (idx + len > output.size()) {
-                    output.resize((output.size() + len) * 2);
-                }
-                const auto offset = symbolOffsets[symbolIndex];
-                if (offset >= symbolCount) {
-                    throw std::runtime_error("FSST decode: symbol index out of bounds");
-                }
-                std::memcpy(&output[idx], &symbols[offset], len);
-                idx += len;
-            } else {
-                throw std::runtime_error("FSST decode: invalid symbol index");
-            }
-        }
-
-        output.resize(idx);
-        return output;
-    }
-
 private:
     IntegerDecoder& intDecoder;
 
@@ -317,6 +272,81 @@ class StringDecoder {
             out.push_back(dictionary[offsets[offsetIndex++]]);
         }
     }
+
+    /// Decodes an FSST-compressed byte sequence.
+    ///
+    /// Each input byte is either a code that indexes the symbol table or the escape byte 255,
+    /// in which case the byte that follows it is copied to the output unchanged.
+    ///
+    /// @param symbols Symbol bytes, stored back to back
+    /// @param symbolCount Number of bytes in `symbols`
+    /// @param symbolLengths Length in bytes of each symbol
+    /// @param symbolLengthCount Number of entries in `symbolLengths`
+    /// @param compressedData Compressed input bytes
+    /// @param compressedDataCount Number of bytes in `compressedData`
+    /// @param decompressedLength Expected output size; only used to pre-allocate the result
+    /// @return The decompressed bytes
+    /// @throws std::runtime_error on truncated input or a reference outside the symbol table
+    ///
+    /// @note The "easy" decoder from the FSST paper would be a faster lookup: it copies 8 bytes
+    /// per code from a fixed 256-entry table and then advances by the symbol length:
+    /// @code
+    /// void decodeSingleByteviaFSST(uint8_t in[], uint8_t out[],
+    ///                              uint64_t sym[256], uint8_t len[256]) {
+    ///     uint8_t code = *in++;
+    ///     *((uint64_t*)out) = sym[code];
+    ///     out += len[code];
+    /// }
+    /// @endcode
+    /// That is currently not possible because the symbols are tightly packed inside the byte
+    /// stream for FSST encoding; the trade-off was made for tighter packing of the symbol table.
+    static std::vector<std::uint8_t> decodeFSST(const std::uint8_t* symbols,
+                                                const std::size_t symbolCount,
+                                                const std::uint32_t* symbolLengths,
+                                                const std::size_t symbolLengthCount,
+                                                const std::uint8_t* compressedData,
+                                                const std::size_t compressedDataCount,
+                                                const std::size_t decompressedLength) {
+        std::vector<std::uint8_t> output;
+        output.reserve(decompressedLength);
+
+        // Start offset of every symbol inside `symbols` (running sum of the lengths). The vector
+        // has to be sized up front because the loop below assigns by index.
+        std::vector<std::size_t> symbolOffsets(symbolLengthCount);
+        for (std::size_t i = 1; i < symbolLengthCount; i++) {
+            symbolOffsets[i] = symbolOffsets[i - 1] + symbolLengths[i - 1];
+        }
+
+        for (std::size_t i = 0; i < compressedDataCount; i++) {
+            const std::uint8_t symbolIndex = compressedData[i];
+            // 255 is our escape byte -> take the next symbol as it is
+            if (symbolIndex == 255) {
+                // An escape byte needs a payload byte; check before reading past the input.
+                if (i + 1 >= compressedDataCount) {
+                    throw std::runtime_error("FSST decode: truncated escape sequence");
+                }
+                // NOTE(review): this rejects an escaped literal 0xFF byte; presumably acceptable
+                // because 0xFF never occurs in valid UTF-8 - confirm.
+                if (compressedData[i + 1] == 255) {
+                    throw std::runtime_error("FSST decode: two escape sequences in a row detected index");
+                }
+                // The escaped byte is plain, uncompressed data and is copied as it is.
+                output.push_back(compressedData[++i]);
+            } else if (symbolIndex < symbolLengthCount) {
+                const std::size_t len = symbolLengths[symbolIndex];
+                const std::size_t offset = symbolOffsets[symbolIndex];
+                // The whole symbol, not only its first byte, has to lie inside the table.
+                if (offset >= symbolCount || len > symbolCount - offset) {
+                    throw std::runtime_error("FSST decode: symbol index out of bounds");
+                }
+                const std::uint8_t* start = symbols + offset;
+                output.insert(output.end(), start, start + len);
+            } else {
+                throw std::runtime_error("FSST decode: invalid symbol index");
+            }
+        }
+        return output;
+    }
 };
 
 } // namespace mlt::decoder
diff --git a/cpp/test/test_fsst.cpp b/cpp/test/test_fsst.cpp
index f76860b9d..68a2742f4 100644
--- a/cpp/test/test_fsst.cpp
+++ b/cpp/test/test_fsst.cpp
@@ -5,7 +5,7 @@
 #include
 #include
 
-TEST(FSST, DecodeFromJava) {
+TEST(FSST, DecodeFromJava_decode1) {
     const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCC";
     const std::vector symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
     const std::vector symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1};
@@ -17,14 +17,68 @@ TEST(FSST, DecodeFromJava) {
 
     EXPECT_EQ(decoded.size(), expected.size());
     EXPECT_EQ(0, memcmp(expected.c_str(), decoded.data(), expected.size()));
+}
+
+TEST(FSST, DecodeFromJava_decode2) {
+    const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCC";
+    const std::vector symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
+    const std::vector symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1};
+    const std::vector javaCompressed = {0, 0, 0, 3, 4, 4, 4, 0, 3, 5, 5, 2, 2, 7, 1,
+                                        1, 1, 8, 8, 8, 1, 1, 0, 0, 3, 2, 2, 5, 5};
 
     // also make sure buffer growth works
     const auto decoded2 = mlt::decoder::StringDecoder::decodeFSST(symbols, symbolLengths, javaCompressed, 0);
     EXPECT_EQ(decoded2.size(), expected.size());
     EXPECT_EQ(0, memcmp(expected.c_str(), decoded2.data(), expected.size()));
+}
+
+TEST(FSST, DecodeFromJava_decode3) {
+    const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCC";
+    const std::vector symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
+    const std::vector symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1};
+    const std::vector javaCompressed = {0, 0, 0, 3, 4, 4, 4, 0, 3, 5, 5, 2, 2, 7, 1,
+                                        1, 1, 8, 8, 8, 1, 1, 0, 0, 3, 2, 2, 5, 5};
 
     const auto decoded3 = mlt::decoder::StringDecoder::decodeFSST(
         symbols, symbolLengths, javaCompressed, expected.size() / 2);
     EXPECT_EQ(decoded3.size(), expected.size());
     EXPECT_EQ(0, memcmp(expected.c_str(), decoded3.data(), expected.size() / 2));
 }
+
+TEST(FSST, DecodeFromJava_With_one_Escape_character) {
+    const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCCk";
+    const std::vector symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
+    const std::vector symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1, 1};
+    const std::vector javaCompressed = {0, 0, 0, 3, 4, 4, 4, 0, 3, 5, 5, 2, 2, 7, 1, 1,
+                                        1, 8, 8, 8, 1, 1, 0, 0, 3, 2, 2, 5, 5, 255, 107};
+
+    const auto decoded = mlt::decoder::StringDecoder::decodeFSST(
+        symbols, symbolLengths, javaCompressed, expected.size());
+    EXPECT_EQ(decoded.size(), expected.size());
+    EXPECT_EQ(0, memcmp(expected.c_str(), decoded.data(), expected.size()));
+}
+
+TEST(FSST, DecodeFromJava_With_multiple_Escape_characters) {
+    const std::string expected = "AAAAAAABBBAAACCdddddEEEEEEfffEEEEAAAAAddddCCkkk";
+    const std::vector symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
+    const std::vector symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1, 1};
+    const std::vector javaCompressed = {0, 0, 0, 3, 4, 4, 4, 0, 3, 5, 5, 2, 2, 7, 1, 1, 1, 8,
+                                        8, 8, 1, 1, 0, 0, 3, 2, 2, 5, 5, 255, 107, 255, 107, 255, 107};
+
+    const auto decoded = mlt::decoder::StringDecoder::decodeFSST(
+        symbols, symbolLengths, javaCompressed, expected.size());
+    EXPECT_EQ(decoded.size(), expected.size());
+    EXPECT_EQ(0, memcmp(expected.c_str(), decoded.data(), expected.size()));
+}
+
+TEST(FSST, DecodeFromJava_With_one_single_escaped_character) {
+    const std::string expected = "k";
+    const std::vector symbols = {65, 65, 69, 69, 100, 100, 65, 66, 67, 69, 100, 102};
+    const std::vector symbolLengths = {2, 2, 2, 1, 1, 1, 1, 1, 1};
+    const std::vector javaCompressed = {255, 107};
+
+    const auto decoded = mlt::decoder::StringDecoder::decodeFSST(
+        symbols, symbolLengths, javaCompressed, expected.size());
+    EXPECT_EQ(decoded.size(), expected.size());
+    EXPECT_EQ(0, memcmp(expected.c_str(), decoded.data(), expected.size()));
+}
diff --git a/justfile b/justfile
index 04dab4703..3c8e516b2 100755
--- a/justfile
+++ b/justfile
@@ -158,13 +158,13 @@ mkdocs-build:
     cd mkdocs && docker run --rm -v ${PWD}:/docs squidfunk/mkdocs-material build --strict
 
 # Build Java encoder and generate .mlt files for all .pbf files in test/fixtures
-[working-directory: 'java']
+#[working-directory: 'java']
 generate-expected-mlt: (cargo-install 'fd' 'fd-find')
     ./gradlew cli
     fd . ../test/fixtures --no-ignore --extension pbf --extension mvt -x {{quote(just_executable())}} generate-one-expected-mlt
 
 # Generate a single .mlt file for a given .mvt or .pbf file, assuming JAR is built
-[working-directory: 'java']
+#[working-directory: 'java']
 [private]
 generate-one-expected-mlt file:
     java \