tudocomp
– The TU Dortmund Compression Framework
EscapeMap.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <array>
5 
6 namespace tdc {namespace io {
// Pool of byte values that may be used as the escape byte and as
// replacement bytes.
// As a heuristic, the pool consists of byte values that are illegal in the
// UTF-8 Unicode encoding, which prevents unnecessary escaping for text inputs.
// The order of the entries matters: consumers walk the pool front to back.
static const std::vector<uint8_t> ESCAPE_BYTE_POOL {
    0xFF, 0xFE,                                            // 255, 254
    0xC0, 0xC1,                                            // 192, 193
    0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD,  // 245 .. 253
};
25 
26  struct EscapeMap {
27  // The map is defined as such:
28  // For escaping:
29  // if input byte is escape_bytes[i], then
30  // escape_bytes[i] -> [replacement_bytes[0], replacement_bytes[i]]
31  // For unescaping:
32  // if input byte is replacement_bytes[0], then
33  // [replacement_bytes[0], replacement_bytes[i]] -> escape_bytes[i]
34 
35  std::vector<uint8_t> m_replacement_bytes;
36  std::vector<uint8_t> m_escape_bytes;
37  bool m_null_terminate = false;
38 
39  inline EscapeMap(const InputRestrictions& rest) {
40  m_escape_bytes = rest.escape_bytes();
41  m_null_terminate = rest.null_terminate();
42 
43  if (!m_escape_bytes.empty()) {
44  // Determine which bytes are legal to use for escaping
45  // by using the POOL constant and removing all
46  // bytes that need escaping
47  for (uint8_t byte : ESCAPE_BYTE_POOL) {
48  bool found = false;
49  for (uint8_t x : m_escape_bytes) {
50  if (x == byte) found = true;
51  }
52  if (!found && m_replacement_bytes.size() < (m_escape_bytes.size() + 1)) {
53  m_replacement_bytes.push_back(byte);
54  }
55  }
56 
57  // m_replacement_bytes[0] is our escape byte, so add it to m_escape_bytes
58  m_escape_bytes.insert(m_escape_bytes.begin(), m_replacement_bytes.at(0));
59  }
60 
61  // For the algorithm to work we need a escape byte, and a byte for each
62  // to-be-escaped byte
63  DCHECK_EQ(m_replacement_bytes.size(), m_escape_bytes.size());
64  }
65 
66  inline bool has_escape_bytes() const {
67  return !m_escape_bytes.empty();
68  }
69 
70  inline uint8_t escape_byte() const {
71  if (has_escape_bytes()) {
72  DCHECK_EQ(m_replacement_bytes.front(), m_escape_bytes.front());
73  return m_replacement_bytes.front();
74  } else {
75  return -1;
76  }
77  }
78 
79  inline const std::vector<uint8_t>& replacement_bytes() const {
80  return m_replacement_bytes;
81  }
82 
83  inline const std::vector<uint8_t>& escape_bytes() const {
84  return m_escape_bytes;
85  }
86 
87  inline bool null_terminate() const {
88  return m_null_terminate;
89  }
90 
91  };
92 
93  // For quick on-the-stack lookups of escaping chars
94  class FastEscapeMap {
95  std::array<uint8_t, 256> m_escape_map;
96  std::array<uint8_t, 256> m_escape_map_flag;
97  uint8_t m_escape_byte = 0;
98  bool m_null_terminate = false;
99  public:
100  inline FastEscapeMap() {
101  for (size_t i = 0; i < 256; i++) {
102  m_escape_map[i] = i;
103  m_escape_map_flag[i] = 0;
104  }
105  }
106 
107  inline FastEscapeMap(const EscapeMap& em): FastEscapeMap() {
108  auto& em_eb = em.escape_bytes();
109  auto& em_rb = em.replacement_bytes();
110  for (size_t i = 0; i < em_eb.size(); i++) {
111  m_escape_map[em_eb[i]] = em_rb[i];
112  m_escape_map_flag[em_eb[i]] = 1;
113  }
114  m_escape_byte = em.escape_byte();
115  m_null_terminate = em.null_terminate();
116  }
117 
118  inline uint8_t lookup_flag(size_t i) const {
119  DCHECK_LT(i, 256);
120  return m_escape_map_flag[i];
121  }
122 
123  inline bool lookup_flag_bool(size_t i) const {
124  return lookup_flag(i) != 0;
125  }
126 
127  inline uint8_t lookup_byte(size_t i) const {
128  DCHECK_LT(i, 256);
129  return m_escape_map[i];
130  }
131 
132  inline uint8_t escape_byte() const {
133  return m_escape_byte;
134  }
135 
136  inline bool null_terminate() const {
137  return m_null_terminate;
138  }
139  };
140 
141  // For quick on-the-stack lookups of unescaping chars
143  std::array<uint8_t, 256> m_unescape_map;
144  uint8_t m_escape_byte = 0;
145  bool m_null_terminate = false;
146  bool m_has_escape_bytes = false;
147  public:
148  inline FastUnescapeMap() {
149  for (size_t i = 0; i < 256; i++) {
150  m_unescape_map[i] = i;
151  }
152  }
153 
155  auto& em_eb = em.escape_bytes();
156  auto& em_rb = em.replacement_bytes();
157  for (size_t i = 0; i < em_eb.size(); i++) {
158  m_unescape_map[em_rb[i]] = em_eb[i];
159  }
160  m_escape_byte = em.escape_byte();
161  m_null_terminate = em.null_terminate();
162  m_has_escape_bytes = em.has_escape_bytes();
163  }
164 
165  inline uint8_t lookup_byte(size_t i) const {
166  DCHECK_LT(i, 256);
167  return m_unescape_map[i];
168  }
169 
170  inline uint8_t escape_byte() const {
171  return m_escape_byte;
172  }
173 
174  inline bool null_terminate() const {
175  return m_null_terminate;
176  }
177 
178  inline bool has_escape_bytes() const {
179  return m_has_escape_bytes;
180  }
181  };
182 }}
bool null_terminate() const
Definition: EscapeMap.hpp:174
bool has_escape_bytes() const
Definition: EscapeMap.hpp:178
Contains the text compression and encoding framework.
Definition: namespaces.hpp:11
EscapeMap(const InputRestrictions &rest)
Definition: EscapeMap.hpp:39
uint8_t escape_byte() const
Definition: EscapeMap.hpp:170
const std::vector< uint8_t > & escape_bytes() const
Definition: EscapeMap.hpp:83
Describes a set of restrictions placed on input data.
uint8_t escape_byte() const
Definition: EscapeMap.hpp:70
uint8_t lookup_byte(size_t i) const
Definition: EscapeMap.hpp:165
const std::vector< uint8_t > & escape_bytes() const
bool lookup_flag_bool(size_t i) const
Definition: EscapeMap.hpp:123
uint8_t lookup_flag(size_t i) const
Definition: EscapeMap.hpp:118
uint8_t lookup_byte(size_t i) const
Definition: EscapeMap.hpp:127
std::vector< uint8_t > m_replacement_bytes
Definition: EscapeMap.hpp:35
bool null_terminate() const
Definition: EscapeMap.hpp:136
std::vector< uint8_t > m_escape_bytes
Definition: EscapeMap.hpp:36
bool null_terminate() const
Definition: EscapeMap.hpp:87
const std::vector< uint8_t > & replacement_bytes() const
Definition: EscapeMap.hpp:79
FastEscapeMap(const EscapeMap &em)
Definition: EscapeMap.hpp:107
uint8_t escape_byte() const
Definition: EscapeMap.hpp:132
bool has_escape_bytes() const
Definition: EscapeMap.hpp:66
FastUnescapeMap(const EscapeMap &em)
Definition: EscapeMap.hpp:154