Skip to content

Commit

Permalink
Remove UrlStats from forward index
Browse files Browse the repository at this point in the history
Related to #4 where the url features are static features that are
already computed in `generate_static_doc_features`.

This is a breaking change for previously created forward indexes that
include the `UrlStats` information. Currently there is no internal
versioning for indexes that are created. See #23.
  • Loading branch information
lgrz committed Jul 6, 2019
1 parent 8eee51c commit 49dba76
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 58 deletions.
35 changes: 2 additions & 33 deletions include/tesserae/forward_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,6 @@
// NOTE(review): this is a header (include/tesserae/forward_index.hpp), so this
// using-directive makes every FastPForLib name visible in each file that
// includes it.
using namespace FastPForLib;
// Global reference to the codec looked up by the name "streamvbyte".
// NOTE(review): this is a non-inline namespace-scope variable defined in a
// header; if more than one translation unit includes this header each one
// gets its own definition — confirm it is only included once, or that the
// build otherwise tolerates this.
IntegerCODEC &codec = *CODECFactory::getFromName("streamvbyte");

/// Pair of 16-bit URL statistics: the number of slashes in a URL and the
/// URL's length. Both values default to zero.
class UrlStats {
  public:
    /// Creates statistics with both values set to zero.
    UrlStats() = default;

    /// Creates statistics from an explicit slash count and URL length.
    UrlStats(uint16_t url_slash_count, uint16_t url_length)
        : m_url_slash_count{url_slash_count}, m_url_length{url_length} {}

    /// Stored slash count.
    uint16_t url_slash_count() const { return m_url_slash_count; }

    /// Stored URL length.
    uint16_t url_length() const { return m_url_length; }

    /// Hands both fields to `archive` in a single call, slash count first and
    /// length second.
    template <class Archive>
    void serialize(Archive &archive) {
        archive(m_url_slash_count, m_url_length);
    }

  private:
    uint16_t m_url_slash_count = 0;
    uint16_t m_url_length = 0;
};

class Field {
uint16_t m_tag_count = 0;
uint16_t m_field_len = 0;
Expand Down Expand Up @@ -77,7 +59,6 @@ class Field {

class Document {
std::vector<uint16_t> m_fields;
UrlStats m_url_stats;
size_t m_num_terms = 0;
std::vector<uint32_t> m_terms;
std::vector<uint32_t> m_unique_terms;
Expand All @@ -88,12 +69,6 @@ class Document {
public:
Document() = default;

// Slash count held in this document's UrlStats member.
uint16_t url_slash_count() const {
    const uint16_t slashes = m_url_stats.url_slash_count();
    return slashes;
}

// URL length held in this document's UrlStats member.
uint16_t url_length() const {
    const uint16_t length_of_url = m_url_stats.url_length();
    return length_of_url;
}

void set_url_stats(const UrlStats &url_stats) { m_url_stats = url_stats; }

// Number of entries in m_terms, converted from the container's size type to
// uint32_t.
uint32_t length() const {
    return static_cast<uint32_t>(m_terms.size());
}

// Returns the term list by value, i.e. the caller receives a copy of m_terms.
const std::vector<uint32_t> terms() const {
    return m_terms;
}
Expand Down Expand Up @@ -297,14 +272,8 @@ class Document {

// Passes the document's members to `archive` as the arguments of a single
// call.
// NOTE(review): this span is taken from a diff view and shows two
// `archive(...)` calls. The first lists `m_url_stats`, the second does not.
// Presumably the first is the version being removed and the second is its
// replacement, so only one of them should exist in the real file — confirm.
// The order of the arguments determines the order in which members are handed
// to the archive, so dropping `m_url_stats` changes the stored layout.
template <class Archive>
void serialize(Archive &archive) {
archive(m_fields,
m_url_stats,
m_num_terms,
m_terms,
m_unique_terms,
m_freqs,
m_field_freqs,
m_field_stats);
archive(
m_fields, m_num_terms, m_terms, m_unique_terms, m_freqs, m_field_freqs, m_field_stats);
}
};

Expand Down
25 changes: 0 additions & 25 deletions src/create_forward_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,6 @@
#include "tesserae/field_map.hpp"
#include "tesserae/forward_index.hpp"

/**
 * Counts the '/' characters in `url`, ignoring the ones that belong to a
 * leading "scheme://" separator.
 *
 * The "://" separator is only skipped when it appears before the first '?'.
 * If it is missing, or only occurs after a '?', counting starts at the
 * beginning of the string. Every '/' from the starting position to the end of
 * the string is counted, including any that follow a '?'.
 *
 * @param url URL text to inspect; may be empty.
 * @return Number of '/' characters found from the starting position onwards.
 */
size_t url_slash_count(const std::string &url) {
    const std::string proto = "://";
    const size_t proto_pos = url.find(proto);
    const size_t query_pos = url.find('?');

    // Skip past "scheme://" only when it is present and not located after a '?'.
    size_t pos = 0;
    if (std::string::npos != proto_pos && proto_pos <= query_pos) {
        pos = proto_pos + proto.size();
    }

    // Search from `pos` itself and step past each hit afterwards. Searching
    // from `pos + 1` on the first iteration would miss a '/' sitting exactly
    // at the starting position (e.g. "/a/b" or "file:///etc").
    size_t count = 0;
    while (std::string::npos != (pos = url.find('/', pos))) {
        ++count;
        ++pos;
    }

    return count;
}

// File-local list of field names (presumably the document fields handled by
// main below — confirm against its use).
// NOTE(review): identifiers that begin with an underscore are reserved to the
// implementation in the global namespace; consider renaming together with
// all uses.
static const std::vector<std::string> _fields = {"body", "title", "heading", "inlink", "a"};

int main(int argc, char const *argv[]) {
Expand Down Expand Up @@ -82,11 +62,6 @@ int main(int argc, char const *argv[]) {
auto & doc_terms = list->terms();
Document document;

futures.push_back(std::async([&]() {
auto url = indri_env.documentMetadata(std::vector<lemur::api::DOCID_T>{docid}, "url");
document.set_url_stats({url_slash_count(url.at(0)), url.at(0).size()});
}));

std::set<uint32_t> unique_terms_set(doc_terms.begin(), doc_terms.end());
std::vector<uint32_t> unique_terms(unique_terms_set.begin(), unique_terms_set.end());
document.set_unique_terms(unique_terms);
Expand Down

0 comments on commit 49dba76

Please sign in to comment.