From d8154e02f03f8ece138bbf3657c93a3d216b3442 Mon Sep 17 00:00:00 2001 From: Enrico Zini Date: Wed, 24 Nov 2021 12:45:42 +0100 Subject: [PATCH] Started implementing the backend for the python JPEG scanner. refs: #277 --- arki/scan/jpeg.cc | 25 ++++++--------- arki/scan/jpeg.h | 8 ++--- python/scan.cc | 82 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 20 deletions(-) diff --git a/arki/scan/jpeg.cc b/arki/scan/jpeg.cc index 02a55145..fc461dd1 100644 --- a/arki/scan/jpeg.cc +++ b/arki/scan/jpeg.cc @@ -82,29 +82,22 @@ void JPEGScanner::set_blob_source(Metadata& md, std::shared_ptr { struct stat st; sys::stat(reader->segment().abspath, st); - stringstream note; + std::stringstream note; note << "Scanned from " << str::basename(reader->segment().relpath); md.add_note(note.str()); md.set_source(Source::createBlob(reader, 0, st.st_size)); } -std::shared_ptr JPEGScanner::scan_nc_data(const std::vector& data) -{ - sys::Tempfile tmpfd; - tmpfd.write_all_or_throw(data.data(), data.size()); - return scan_nc_file(tmpfd.name()); -} - std::shared_ptr JPEGScanner::scan_data(const std::vector& data) { - std::shared_ptr md = scan_nc_data(data); - md->set_source_inline("nc", metadata::DataManager::get().to_data("nc", std::vector(data))); + std::shared_ptr md = scan_jpeg_data(data); + md->set_source_inline("jpeg", metadata::DataManager::get().to_data("jpeg", std::vector(data))); return md; } std::shared_ptr JPEGScanner::scan_singleton(const std::string& abspath) { - return scan_nc_file(abspath); + return scan_jpeg_file(abspath); } bool JPEGScanner::scan_segment(std::shared_ptr reader, metadata_dest_func dest) @@ -113,10 +106,10 @@ bool JPEGScanner::scan_segment(std::shared_ptr reader, metadata auto st = sys::stat(reader->segment().abspath); if (!st) return true; if (S_ISDIR(st->st_mode)) - throw std::runtime_error("JPEGH5::scan_segment cannot be called on directory segments"); + throw std::runtime_error("JPEGScanner::scan_segment cannot be called on 
directory segments"); if (!st->st_size) return true; - auto md = scan_nc_file(reader->segment().abspath); + auto md = scan_jpeg_file(reader->segment().abspath); set_blob_source(*md, reader); return dest(md); } @@ -155,13 +148,13 @@ MockJPEGScanner::~MockJPEGScanner() delete engine; } -std::shared_ptr MockJPEGScanner::scan_nc_file(const std::string& pathname) +std::shared_ptr MockJPEGScanner::scan_jpeg_file(const std::string& pathname) { auto buf = sys::read_file(pathname); return engine->lookup(reinterpret_cast(buf.data()), buf.size()); } -std::shared_ptr MockJPEGScanner::scan_nc_data(const std::vector& data) +std::shared_ptr MockJPEGScanner::scan_jpeg_data(const std::vector& data) { return engine->lookup(data.data(), data.size()); } @@ -169,7 +162,7 @@ std::shared_ptr MockJPEGScanner::scan_nc_data(const std::vector(); }); } diff --git a/arki/scan/jpeg.h b/arki/scan/jpeg.h index 2cc47f82..4008a99e 100644 --- a/arki/scan/jpeg.h +++ b/arki/scan/jpeg.h @@ -18,8 +18,8 @@ class JPEGScanner : public Scanner void set_blob_source(Metadata& md, std::shared_ptr reader); protected: - virtual std::shared_ptr scan_nc_file(const std::string& pathname) = 0; - virtual std::shared_ptr scan_nc_data(const std::vector& data); + virtual std::shared_ptr scan_jpeg_file(const std::string& pathname) = 0; + virtual std::shared_ptr scan_jpeg_data(const std::vector& data) = 0; public: std::string name() const override { return "nc"; } @@ -36,8 +36,8 @@ class MockJPEGScanner : public JPEGScanner protected: MockEngine* engine; - std::shared_ptr scan_nc_file(const std::string& pathname) override; - std::shared_ptr scan_nc_data(const std::vector& data) override; + std::shared_ptr scan_jpeg_file(const std::string& pathname) override; + std::shared_ptr scan_jpeg_data(const std::vector& data) override; public: MockJPEGScanner(); diff --git a/python/scan.cc b/python/scan.cc index 67924ab6..7f14a4c1 100644 --- a/python/scan.cc +++ b/python/scan.cc @@ -15,6 +15,7 @@ #include "arki/scan/bufr.h" #include 
"arki/scan/odimh5.h" #include "arki/scan/netcdf.h" +#include "arki/scan/jpeg.h" #include "arki/nag.h" #include #ifdef HAVE_DBALLE @@ -492,6 +493,87 @@ class PythonNetCDFScanner : public arki::scan::NetCDFScanner }; +/* + * scan.jpeg module contents + */ + +PyObject* jpegscanner_object = nullptr; + +void load_jpegscanner_object() +{ + load_scanners(); + + // Instantiate arkimet.scan.jpeg.Scanner + pyo_unique_ptr module(throw_ifnull(PyImport_ImportModule("arkimet.scan.jpeg"))); + pyo_unique_ptr cls(throw_ifnull(PyObject_GetAttrString(module, "Scanner"))); + pyo_unique_ptr obj(throw_ifnull(PyObject_CallFunction(cls, nullptr))); + + // Hold a reference to the JPEG scanner instance forever once loaded the first time + jpegscanner_object = obj.release(); +} + + +class PythonJPEGScanner : public arki::scan::JPEGScanner +{ +protected: + std::shared_ptr scan_jpeg_file(const std::string& pathname) override + { + auto md = std::make_shared(); + + AcquireGIL gil; + if (!jpegscanner_object) + load_jpegscanner_object(); + + pyo_unique_ptr pyfname(to_python(pathname)); + pyo_unique_ptr pymd((PyObject*)metadata_create(md)); + pyo_unique_ptr obj(throw_ifnull(PyObject_CallMethod( + jpegscanner_object, "scan_file", "OO", pyfname.get(), pymd.get()))); + + // If use_count is > 1, it means we are potentially and unexpectedly + // holding all the metadata (and potentially their data) in memory, + // while a supported and important use case is to stream out one + // metadata at a time + pymd.reset(nullptr); + if (md.use_count() != 1) + arki::nag::warning("metadata use count after scanning is %ld instead of 1", md.use_count()); + + return md; + } + + std::shared_ptr scan_jpeg_data(const std::vector& data) override + { + auto md = std::make_shared(); + + AcquireGIL gil; + if (!jpegscanner_object) + load_jpegscanner_object(); + + pyo_unique_ptr pydata(to_python(data)); + pyo_unique_ptr pymd((PyObject*)metadata_create(md)); + pyo_unique_ptr obj(throw_ifnull(PyObject_CallMethod( + jpegscanner_object, "scan_data", 
"OO", pydata.get(), pymd.get()))); + + // If use_count is > 1, it means we are potentially and unexpectedly + // holding all the metadata (and potentially their data) in memory, + // while a supported and important use case is to stream out one + // metadata at a time + pymd.reset(nullptr); + if (md.use_count() != 1) + arki::nag::warning("metadata use count after scanning is %ld instead of 1", md.use_count()); + + return md; + } + +public: + PythonJPEGScanner() + { + } + virtual ~PythonJPEGScanner() + { + } +}; + + /* * scan.vm2 module functions */