Skip to content

Commit

Permalink
Started implementing the backend for the python JPEG scanner. refs: #277
Browse files Browse the repository at this point in the history
  • Loading branch information
spanezz committed Nov 24, 2021
1 parent 4f6cb4d commit d8154e0
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 20 deletions.
25 changes: 9 additions & 16 deletions arki/scan/jpeg.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,29 +82,22 @@ void JPEGScanner::set_blob_source(Metadata& md, std::shared_ptr<segment::Reader>
{
struct stat st;
sys::stat(reader->segment().abspath, st);
stringstream note;
std::stringstream note;
note << "Scanned from " << str::basename(reader->segment().relpath);
md.add_note(note.str());
md.set_source(Source::createBlob(reader, 0, st.st_size));
}

// Scan an in-memory buffer by writing it out to a temporary file
// (sys::Tempfile) and delegating to scan_nc_file() with that file's name.
// NOTE(review): the header hunk in this same commit makes scan_jpeg_data()
// pure virtual, which suggests this definition is the one being removed --
// confirm against the applied diff.
std::shared_ptr<Metadata> JPEGScanner::scan_nc_data(const std::vector<uint8_t>& data)
{
sys::Tempfile tmpfd;
tmpfd.write_all_or_throw(data.data(), data.size());
return scan_nc_file(tmpfd.name());
}

// Scan an in-memory buffer and attach a copy of that buffer to the resulting
// metadata as its inline source.
//
// The scan itself is delegated to the per-format data hook; the buffer is then
// copied into a std::vector and handed to metadata::DataManager::to_data()
// to build the inline source.
//
// NOTE(review): this listing appears to show both sides of the diff: the
// first md/set_source_inline pair uses the "nc" names, the second pair is the
// "jpeg" replacement. Only one pair can be present in the compiled file.
std::shared_ptr<Metadata> JPEGScanner::scan_data(const std::vector<uint8_t>& data)
{
std::shared_ptr<Metadata> md = scan_nc_data(data);
md->set_source_inline("nc", metadata::DataManager::get().to_data("nc", std::vector<uint8_t>(data)));
std::shared_ptr<Metadata> md = scan_jpeg_data(data);
md->set_source_inline("jpeg", metadata::DataManager::get().to_data("jpeg", std::vector<uint8_t>(data)));
return md;
}

// Scan the file at abspath by forwarding to the per-format file hook and
// returning its metadata unchanged (no source is set here).
// NOTE(review): the two return statements appear to be the before/after sides
// of the diff (scan_nc_file -> scan_jpeg_file); only the second is reachable
// as written.
std::shared_ptr<Metadata> JPEGScanner::scan_singleton(const std::string& abspath)
{
return scan_nc_file(abspath);
return scan_jpeg_file(abspath);
}

bool JPEGScanner::scan_segment(std::shared_ptr<segment::Reader> reader, metadata_dest_func dest)
Expand All @@ -113,10 +106,10 @@ bool JPEGScanner::scan_segment(std::shared_ptr<segment::Reader> reader, metadata
auto st = sys::stat(reader->segment().abspath);
if (!st) return true;
if (S_ISDIR(st->st_mode))
throw std::runtime_error("JPEGH5::scan_segment cannot be called on directory segments");
throw std::runtime_error("JPEGScanner::scan_segment cannot be called on directory segments");
if (!st->st_size) return true;

auto md = scan_nc_file(reader->segment().abspath);
auto md = scan_jpeg_file(reader->segment().abspath);
set_blob_source(*md, reader);
return dest(md);
}
Expand Down Expand Up @@ -155,21 +148,21 @@ MockJPEGScanner::~MockJPEGScanner()
delete engine;
}

// Mock implementation of the file hook: read the whole file into memory with
// sys::read_file() and look its bytes up in the mock engine.
// NOTE(review): the two signature lines appear to be the before/after sides of
// the rename scan_nc_file -> scan_jpeg_file.
std::shared_ptr<Metadata> MockJPEGScanner::scan_nc_file(const std::string& pathname)
std::shared_ptr<Metadata> MockJPEGScanner::scan_jpeg_file(const std::string& pathname)
{
auto buf = sys::read_file(pathname);
// engine->lookup() takes a uint8_t pointer and a length, hence the cast of buf.data()
return engine->lookup(reinterpret_cast<const uint8_t*>(buf.data()), buf.size());
}

// Mock implementation of the data hook: look the buffer's bytes up directly in
// the mock engine, without touching the filesystem.
// NOTE(review): the two signature lines appear to be the before/after sides of
// the rename scan_nc_data -> scan_jpeg_data.
std::shared_ptr<Metadata> MockJPEGScanner::scan_nc_data(const std::vector<uint8_t>& data)
std::shared_ptr<Metadata> MockJPEGScanner::scan_jpeg_data(const std::vector<uint8_t>& data)
{
return engine->lookup(data.data(), data.size());
}


// Register a scanner factory that creates a MockJPEGScanner each time it is
// invoked.
// NOTE(review): the two register_factory lines appear to be the before/after
// sides of the diff: the format key changes from "nc" to "jpeg".
void register_jpeg_scanner()
{
Scanner::register_factory("nc", [] {
Scanner::register_factory("jpeg", [] {
return std::make_shared<scan::MockJPEGScanner>();
});
}
Expand Down
8 changes: 4 additions & 4 deletions arki/scan/jpeg.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ class JPEGScanner : public Scanner
void set_blob_source(Metadata& md, std::shared_ptr<segment::Reader> reader);

protected:
virtual std::shared_ptr<Metadata> scan_nc_file(const std::string& pathname) = 0;
virtual std::shared_ptr<Metadata> scan_nc_data(const std::vector<uint8_t>& data);
virtual std::shared_ptr<Metadata> scan_jpeg_file(const std::string& pathname) = 0;
virtual std::shared_ptr<Metadata> scan_jpeg_data(const std::vector<uint8_t>& data) = 0;

public:
/// Name of the data format handled by this scanner. Uses the same "jpeg" key
/// under which the scanner factory is registered and inline sources are tagged.
std::string name() const override { return "jpeg"; }
Expand All @@ -36,8 +36,8 @@ class MockJPEGScanner : public JPEGScanner
protected:
MockEngine* engine;

std::shared_ptr<Metadata> scan_nc_file(const std::string& pathname) override;
std::shared_ptr<Metadata> scan_nc_data(const std::vector<uint8_t>& data) override;
std::shared_ptr<Metadata> scan_jpeg_file(const std::string& pathname) override;
std::shared_ptr<Metadata> scan_jpeg_data(const std::vector<uint8_t>& data) override;

public:
MockJPEGScanner();
Expand Down
82 changes: 82 additions & 0 deletions python/scan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "arki/scan/bufr.h"
#include "arki/scan/odimh5.h"
#include "arki/scan/netcdf.h"
#include "arki/scan/jpeg.h"
#include "arki/nag.h"
#include <grib_api.h>
#ifdef HAVE_DBALLE
Expand Down Expand Up @@ -492,6 +493,87 @@ class PythonNetCDFScanner : public arki::scan::NetCDFScanner
};


/*
* scan.jpeg module contents
*/

PyObject* jpegscanner_object = nullptr;

void load_jpegscanner_object()
{
load_scanners();

// Get arkimet.scan.nc.BufrScanner
pyo_unique_ptr module(throw_ifnull(PyImport_ImportModule("arkimet.scan.jpeg")));
pyo_unique_ptr cls(throw_ifnull(PyObject_GetAttrString(module, "Scanner")));
pyo_unique_ptr obj(throw_ifnull(PyObject_CallFunction(cls, nullptr)));

// Hold a reference to arki.python.BBox forever once loaded the first time
ncscanner_object = obj.release();
}


class PythonJPEGScanner : public arki::scan::JPEGScanner
{
protected:
std::shared_ptr<Metadata> scan_jpeg_file(const std::string& pathname) override
{
auto md = std::make_shared<Metadata>();

AcquireGIL gil;
if (!ncscanner_object)
load_jpegscanner_object();

pyo_unique_ptr pyfname(to_python(pathname));
pyo_unique_ptr pymd((PyObject*)metadata_create(md));
pyo_unique_ptr obj(throw_ifnull(PyObject_CallMethod(
ncscanner_object, "scan_file", "OO", pyfname.get(), pymd.get())));

// If use_count is > 1, it means we are potentially and unexpectedly
// holding all the metadata (and potentially their data) in memory,
// while a supported and important use case is to stream out one
// metadata at a time
pymd.reset(nullptr);
if (md.use_count() != 1)
arki::nag::warning("metadata use count after scanning is %ld instead of 1", md.use_count());

return md;
}

std::shared_ptr<Metadata> scan_jpeg_data(const std::vector<uint8_t>& data) override
{
auto md = std::make_shared<Metadata>();

AcquireGIL gil;
if (!ncscanner_object)
load_jpegscanner_object();

pyo_unique_ptr pydata(to_python(data));
pyo_unique_ptr pymd((PyObject*)metadata_create(md));
pyo_unique_ptr obj(throw_ifnull(PyObject_CallMethod(
ncscanner_object, "scan_data", "OO", pydata.get(), pymd.get())));

// If use_count is > 1, it means we are potentially and unexpectedly
// holding all the metadata (and potentially their data) in memory,
// while a supported and important use case is to stream out one
// metadata at a time
pymd.reset(nullptr);
if (md.use_count() != 1)
arki::nag::warning("metadata use count after scanning is %ld instead of 1", md.use_count());

return md;
}

public:
PythonJPEGScanner()
{
}
virtual ~PythonJPEGScanner()
{
}
};


/*
* scan.vm2 module functions
*/
Expand Down

0 comments on commit d8154e0

Please sign in to comment.