sqlite引入的fts5扩展功能,会创建table_content/table_data等数据表,其中_data中是索引数据,可以尝试解析还原出很多有趣的内容。
至于数据结构后面有心情了再补充,直接贴代码。
此代码为C++类。
#pragma once #include <string> #include <vector> #include <map> #define _fts5GetVarint32(a,b) Fts5GetVarint32((const unsigned char*)a,(uint32_t*)&b) #define _fts5GetVarint(a,b) Fts5GetVarint((const unsigned char*)a,b) #define _fts5FastGetVarint32(a, iOff, nVal) { \ nVal = (a)[iOff++]; \ if( nVal & 0x80 ){ \ iOff--; \ iOff += _fts5GetVarint32(&(a)[iOff], nVal); \ } \ } class FTS5DataHandler { public: FTS5DataHandler() = default; #define FTS5_DATA_ID_B 16 /* Max seg id number 65535 */ #define FTS5_DATA_DLI_B 1 /* Doclist-index flag (1 bit) */ #define FTS5_DATA_HEIGHT_B 5 /* Max dlidx tree height of 32 */ #define FTS5_DATA_PAGE_B 31 /* Max page number of 2147483648 */ static void fts5DecodeRowid( int64_t iRowid, /* Rowid from %_data table */ int* piSegid, /* OUT: Segment id */ int* pbDlidx, /* OUT: Dlidx flag */ int* piHeight, /* OUT: Height */ int* piPgno /* OUT: Page number */ ) { *piPgno = (int)(iRowid & (((int64_t)1 << FTS5_DATA_PAGE_B) - 1)); iRowid >>= FTS5_DATA_PAGE_B; *piHeight = (int)(iRowid & (((int64_t)1 << FTS5_DATA_HEIGHT_B) - 1)); iRowid >>= FTS5_DATA_HEIGHT_B; *pbDlidx = (int)(iRowid & 0x0001); iRowid >>= FTS5_DATA_DLI_B; *piSegid = (int)(iRowid & (((int64_t)1 << FTS5_DATA_ID_B) - 1)); } static std::pair<std::string, bool> ParserFts5BlockOverData(const std::string& block) { int nDataLen = (int)block.length(); if (nDataLen < 4) return {}; const char* pData = block.c_str(); int iRowidOff = fts5GetU16((const unsigned char*)&pData[0]); int iPgidx = fts5GetU16((const unsigned char*)&pData[2]); //词条数据总长度 if (iPgidx > nDataLen) return {}; if (iPgidx == nDataLen) return { block.substr(4, iPgidx - 4), true }; int iOverdataLen = 0; _fts5GetVarint32(&pData[iPgidx], iOverdataLen); if (iOverdataLen <= 4) return {}; if (iOverdataLen >= nDataLen) return {}; return { block.substr(4, iOverdataLen - 4), false }; } void ParserFts5BlockData(const std::string& block, const std::vector<std::string>& overdatas) { if (block.length() < 4) return; const char* pData = block.data(); int nDataSize = (int)block.length(); int iRowidOff = fts5GetU16((const unsigned char*)&pData[0]); //上一条数据的pos list的over data len + 4 int iPgidx = fts5GetU16((const unsigned char*)&pData[2]); //词条数据总长度 if (iPgidx >= nDataSize) return; int iTermOff = 0; int iPgidxOff = iPgidx; iPgidxOff += _fts5GetVarint32(&pData[iPgidxOff], iTermOff); //上一条数据的doclist的over data assert(iTermOff); std::string last_block; int iLastTermLen = 0; std::vector<int> overdataLens; //解析词条数据 bool bFirst = true; std::string word; while (true) { int nTermLen; if (iPgidxOff < nDataSize) { iPgidxOff += _fts5GetVarint32(&pData[iPgidxOff], nTermLen); } else if (iTermOff < iPgidx) { //末尾块 nTermLen = iPgidx - iTermOff; if (!overdatas.empty()) { std::string overdata; for (const auto& data : overdatas) { overdata.append(data); overdataLens.emplace_back((int)data.length()); } last_block = block.substr(iTermOff, nTermLen); last_block.append(overdata); //修正变量 iLastTermLen = nTermLen; //用于重置overdata的rowid索引 nTermLen += (int)overdata.length(); iPgidx = nTermLen; iTermOff = 0; pData = last_block.data(); } } else { //数据解析到头了 break; } if (iTermOff + nTermLen > iPgidx) break; int iTermEnd = iTermOff + nTermLen; //fmtlog("term len:{}, offset:0x{:x}", nTermLen, iTermOff); //读取词条keep int iWordKeep = 0; if (!bFirst) { _fts5FastGetVarint32(pData, iTermOff, iWordKeep); if (iWordKeep <= 0) break; iWordKeep--; } //读取词条长度 int nWordLen; _fts5FastGetVarint32(pData, iTermOff, nWordLen); if (nWordLen <= 0) break; if (iTermOff + nWordLen >= iPgidx) break; //读取词条 if (bFirst && nWordLen > 1) { //跳过一个字节 if (nWordLen > 1) word.assign(&pData[iTermOff + 1], nWordLen - 1); else word.assign(&pData[iTermOff], nWordLen); //理论上不应该到这里 bFirst = false; } else { if (iWordKeep) { if (iWordKeep > (int)word.length()) break; word.assign(word, 0, iWordKeep); word.append(&pData[iTermOff], nWordLen); } else { word.assign(&pData[iTermOff], nWordLen); } } iTermOff += nWordLen; //fmtlog("word:{}({}), keep:{}", Utf82GBK(word), String2Hex(word), iWordKeep); // if (word == "j") // int b = 0; //解析docList int nRowID = 0; while (iTermOff < iTermEnd) { if (iLastTermLen && iTermOff >= iLastTermLen) { //重置rowid nRowID = 0; if (!overdataLens.empty()) { iLastTermLen += overdataLens[0]; overdataLens.erase(overdataLens.begin()); } else { iLastTermLen = 0; } } //读取rowid(docid) int nRowidTmp = 0; iTermOff += _fts5GetVarint32(&pData[iTermOff], nRowidTmp); //增量 nRowID += nRowidTmp; //fmtlog(" rowid:{}", nRowID); //读取pos list len int nPosListSize = 0; iTermOff += _fts5GetVarint32(&pData[iTermOff], nPosListSize); if (nPosListSize < 0) break; nPosListSize /= 2; //fmtlog(" pos list size:{}", nPosListSize); if (iTermOff + nPosListSize > iTermEnd) break; if (nPosListSize == 0) continue; //读取pos list auto poslist = fts5DecodePosList(&pData[iTermOff], nPosListSize); if (m_rowidsFilter.empty() || !m_rowidsFilter.HasKey(nRowID)) { //printf("word:%s(%s), keep:%d \n", Utf82GBK(word).c_str(), String2Hex(word).c_str(), iWordKeep); //printf(" rowid:%d \n", nRowID); //printf(" pos list size:%d \n", nPosListSize); //fmtlog(" pos list:{}", poslist); for (const auto& posinfo : poslist) { for (const auto& pos : posinfo.second) { m_result[nRowID][posinfo.first].emplace(pos, word); //auto ret = m_result[nRowID][posinfo.first].emplace(pos, word); // if (!ret.second) // printf("break;\n"); } } } iTermOff += nPosListSize; } } } inline void SetRowidFilter(int nRowid) { m_rowidsFilter.emplace(nRowid); } inline void ClearRowidFilter() { m_rowidsFilter.clear(); } std::map<int, std::map<int, std::string>> GetResult() { //整理词语转为真正的句子 //std::map<int, std::map<int, std::vector<std::pair<std::string, std::vector<int>>>>> m_result; // rowid colid content std::map<int, std::map<int, std::string>> ret; for (auto& kv : m_result) { int nRowid = kv.first; for (auto& rowinfo : kv.second) { int nColid = rowinfo.first; //合并字符串 std::string content; std::for_each(rowinfo.second.begin(), rowinfo.second.end(), [&content](const std::pair<int, std::string> &p) { content.append(p.second); }); ret[nRowid][nColid] = content; //ret[nRowid][nColid] = Utf82GBK(content); } } #ifdef _DEBUG //fmtlog("ret:{}", ret); #endif // _DEBUG return ret; } private: FKeySet<int> m_rowidsFilter; //结果集 // rowid colid index word std::map<int, std::map<int, std::map<int, std::string, std::less<>>>> m_result; //列id-pos list // col pos list static std::map<int, std::vector<int>> fts5DecodePosList(const char* pData, int nPosListSize) { std::map<int, std::vector<int>> ret; int nPos = 0; int nDocOff = 0; int nColID = 0; int nPosID = 0; while (nDocOff < nPosListSize) { int nPos; _fts5FastGetVarint32(pData, nDocOff, nPos); if (nPos <= 0) break; if (nPos == 1) { //列切换 int nColOff; _fts5FastGetVarint32(pData, nDocOff, nColOff); if (nColOff <= 0) break; nColID += nColOff; nPosID = 0; continue; } nPosID += nPos - 2; ret[nColID].emplace_back(nPosID); } return ret; } #define SLOT_2_0 0x001fc07f #define SLOT_4_2_0 0xf01fc07f static uint16_t fts5GetU16(const unsigned char* aIn) { return ((uint16_t)aIn[0] << 8) + aIn[1]; } static uint8_t Fts5GetVarint(const unsigned char* p, uint64_t* v) { uint32_t a, b, s; a = *p; /* a: p0 (unmasked) */ if (!(a & 0x80)) { *v = a; return 1; } p++; b = *p; /* b: p1 (unmasked) */ if (!(b & 0x80)) { a &= 0x7f; a = a << 7; a |= b; *v = a; return 2; } /* Verify that constants are precomputed correctly */ assert(SLOT_2_0 == ((0x7f << 14) | (0x7f))); assert(SLOT_4_2_0 == ((0xfU << 28) | (0x7f << 14) | (0x7f))); p++; a = a << 14; a |= *p; /* a: p0<<14 | p2 (unmasked) */ if (!(a & 0x80)) { a &= SLOT_2_0; b &= 0x7f; b = b << 7; a |= b; *v = a; return 3; } /* CSE1 from below */ a &= SLOT_2_0; p++; b = b << 14; b |= *p; /* b: p1<<14 | p3 (unmasked) */ if (!(b & 0x80)) { b &= SLOT_2_0; /* moved CSE1 up */ /* a &= (0x7f<<14)|(0x7f); */ a = a << 7; a |= b; *v = a; return 4; } /* a: p0<<14 | p2 (masked) */ /* b: p1<<14 | p3 (unmasked) */ /* 1:save off p0<<21 | p1<<14 | p2<<7 | p3 (masked) */ /* moved CSE1 up */ /* a &= (0x7f<<14)|(0x7f); */ b &= SLOT_2_0; s = a; /* s: p0<<14 | p2 (masked) */ p++; a = a << 14; a |= *p; /* a: p0<<28 | p2<<14 | p4 (unmasked) */ if (!(a & 0x80)) { /* we can skip these cause they were (effectively) done above in calc'ing s */ /* a &= (0x7f<<28)|(0x7f<<14)|(0x7f); */ /* b &= (0x7f<<14)|(0x7f); */ b = b << 7; a |= b; s = s >> 18; *v = ((uint64_t)s) << 32 | a; return 5; } /* 2:save off p0<<21 | p1<<14 | p2<<7 | p3 (masked) */ s = s << 7; s |= b; /* s: p0<<21 | p1<<14 | p2<<7 | p3 (masked) */ p++; b = b << 14; b |= *p; /* b: p1<<28 | p3<<14 | p5 (unmasked) */ if (!(b & 0x80)) { /* we can skip this cause it was (effectively) done above in calc'ing s */ /* b &= (0x7f<<28)|(0x7f<<14)|(0x7f); */ a &= SLOT_2_0; a = a << 7; a |= b; s = s >> 18; *v = ((uint64_t)s) << 32 | a; return 6; } p++; a = a << 14; a |= *p; /* a: p2<<28 | p4<<14 | p6 (unmasked) */ if (!(a & 0x80)) { a &= SLOT_4_2_0; b &= SLOT_2_0; b = b << 7; a |= b; s = s >> 11; *v = ((uint64_t)s) << 32 | a; return 7; } /* CSE2 from below */ a &= SLOT_2_0; p++; b = b << 14; b |= *p; /* b: p3<<28 | p5<<14 | p7 (unmasked) */ if (!(b & 0x80)) { b &= SLOT_4_2_0; /* moved CSE2 up */ /* a &= (0x7f<<14)|(0x7f); */ a = a << 7; a |= b; s = s >> 4; *v = ((uint64_t)s) << 32 | a; return 8; } p++; a = a << 15; a |= *p; /* a: p4<<29 | p6<<15 | p8 (unmasked) */ /* moved CSE2 up */ /* a &= (0x7f<<29)|(0x7f<<15)|(0xff); */ b &= SLOT_2_0; b = b << 8; a |= b; s = s << 4; b = p[-4]; b &= 0x7f; b = b >> 3; s |= b; *v = ((uint64_t)s) << 32 | a; return 9; } static int Fts5GetVarint32(const unsigned char* p, uint32_t* v) { uint32_t a, b; /* The 1-byte case. Overwhelmingly the most common. */ a = *p; /* a: p0 (unmasked) */ if (!(a & 0x80)) { /* Values between 0 and 127 */ *v = a; return 1; } /* The 2-byte case */ p++; b = *p; /* b: p1 (unmasked) */ if (!(b & 0x80)) { /* Values between 128 and 16383 */ a &= 0x7f; a = a << 7; *v = a | b; return 2; } /* The 3-byte case */ p++; a = a << 14; a |= *p; /* a: p0<<14 | p2 (unmasked) */ if (!(a & 0x80)) { /* Values between 16384 and 2097151 */ a &= (0x7f << 14) | (0x7f); b &= 0x7f; b = b << 7; *v = a | b; return 3; } /* A 32-bit varint is used to store size information in btrees. ** Objects are rarely larger than 2MiB limit of a 3-byte varint. ** A 3-byte varint is sufficient, for example, to record the size ** of a 1048569-byte BLOB or string. ** ** We only unroll the first 1-, 2-, and 3- byte cases. The very ** rare larger cases can be handled by the slower 64-bit varint ** routine. */ { uint64_t v64; uint8_t n; p -= 2; n = Fts5GetVarint(p, &v64); *v = ((uint32_t)v64) & 0x7FFFFFFF; assert(n > 3 && n <= 9); return n; } } };