Read a single file from an archive #271
From my research I find that zip files have a central directory at the end of the file that can be read to find where the separate files are located. Remote files on Zenodo can be partially read using the HTTP Range header.
This is another nice blog post about the process: https://www.djmannion.net/partial_zip/index.html
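As a minimal sketch of the idea (assuming the archive has no trailing comment, so the end-of-central-directory record is exactly the last 22 bytes), the EOCD can be fetched with a single Range request and recognised by its signature:

library(httr2)
url <- "https://zenodo.org/records/10671148/files/pilot2.zip"
# Ask the server for only the last 22 bytes of the archive.
eocd <- request(url) |>
  req_headers(Range = "bytes=-22") |>
  req_perform() |>
  resp_body_raw()
# The EOCD record starts with the signature PK\x05\x06; bytes 13-16 hold the
# central directory size and bytes 17-20 its offset (both little-endian).
identical(eocd[1:4], as.raw(c(0x50, 0x4b, 0x05, 0x06)))

The full code below builds on exactly this: the offset and size read from the EOCD are used to download and parse the central directory.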
I have played around a bit and parsing a remote zip file is not so difficult. The following code reads data from a remote zip over HTTP. There is still plenty of room for improvement, but files can already be read within seconds.
require(httr2)
#> Loading required package: httr2
system.time({
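# Fetch the end-of-central-directory (EOCD) record (the last 22 bytes, assuming no
# archive comment), read the central directory offset and size from it, then fetch
# the central directory itself with a second Range request.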
get_cd <- function(x = "https://zenodo.org/records/10671148/files/pilot2.zip") {
end <- request(x) |>
req_headers(Range = "bytes=-22") |>
req_perform() |>
purrr::chuck("body")
cd_start <- end[17:20] |>
rawToBits() |>
packBits("integer")
cd_len <- end[13:16] |>
rawToBits() |>
packBits("integer")
header <- request(x) |>
req_headers(Range = glue::glue("bytes={cd_start}-{cd_start+cd_len+22-1}")) |>
req_perform() |>
purrr::chuck("body")
return(header)
}
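# Convert a 2-byte little-endian raw value to an integer by padding it to 4 bytes.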
raw2ToInt <- function(x) {
c(x, as.raw(0x00), as.raw(0x00)) |>
rawToBits() |>
packBits("integer")
}
dd <- get_cd()
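# Parse the central directory: each entry starts with the signature PK\x01\x02 and a
# 46-byte fixed header, followed by a variable-length filename, extra field and comment.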
parsecd <- function(x) {
deparse <- c(
signature = 4, version_made_by = 2, version_need_to_extract = 2, bit_flag = 2,
compression_method = 2, last_mod_time = 2, last_mod_date = 2, crc32 = 4, compressed_size = 4, uncompressed_size = 4,
filename_length = 2, extra_field_length = 2, file_comment_length = 2, disk_num = 2, int_file_attr = 2, ext_file_attr = 4, rel_offset = 4
)
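# Expand each field name to one copy per byte so split() can slice the fixed
# header into named raw vectors.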
deparse <- unlist(purrr::map2(names(deparse), deparse, ~ rep(.x, each = .y)))
res <- list()
while (all(head(x, 4) == as.raw(c(0x50, 0x4b, 0x01, 0x02)))) {
l <- split(head(x, length(deparse)), deparse)
x <- tail(x, -length(deparse))
filename_length_int <- raw2ToInt(l$filename_length)
extra_field_length_int <- raw2ToInt(l$extra_field_length)
file_comment_length_int <- raw2ToInt(l$file_comment_length)
l[["filename"]] <- head(x, filename_length_int)
x <- tail(x, -filename_length_int)
l[["extra_field"]] <- head(x, extra_field_length_int)
if (extra_field_length_int != 0) {
x <- tail(x, -extra_field_length_int)
}
l[["file_comment"]] <- head(x, file_comment_length_int)
if (file_comment_length_int != 0) {
x <- tail(x, -file_comment_length_int)
}
# res <- c(res, list(tibble::tibble( lapply(l, list))))
res <- c(res, list(structure(lapply(l, list), row.names = c(
NA,
-1L
), class = "data.frame")))
}
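# Combine the per-entry rows and decode the fields needed later: the filename as
# text, the offsets and sizes as little-endian integers.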
rr <- dplyr::bind_rows(res) |>
dplyr::mutate(
filename = purrr::map_chr(filename, rawToChar),
rel_offset = purrr::map_int(rel_offset, ~ packBits(rawToBits(.x), "integer")),
compressed_size = purrr::map_int(compressed_size, ~ packBits(rawToBits(.x), "integer")),
uncompressed_size = purrr::map_int(uncompressed_size, ~ packBits(rawToBits(.x), "integer"))
)
rr
}
rr <- parsecd(dd)
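# Locate media.csv and take the offset of the following entry as the end of its
# byte range (this assumes entries are stored contiguously in offset order).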
rr |>
dplyr::mutate(next_rel_offset = dplyr::lead(rel_offset)) |>
dplyr::filter(grepl(pat = "media.csv", filename)) -> file
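# Fetch the member's local file header plus its compressed data in one Range request.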
depcsv <- request("https://zenodo.org/records/10671148/files/pilot2.zip") |>
req_headers(Range = glue::glue("bytes={file$rel_offset}-{file$next_rel_offset-1}")) |>
req_perform() |>
purrr::chuck("body")
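# Strip the local file header (a 30-byte fixed part followed by the filename and
# extra field) to get at the raw deflate stream.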
deparself <- c(
signature = 4, version_need_to_extract = 2, bit_flag = 2,
compression_method = 2, last_mod_time = 2, last_mod_date = 2,
crc32 = 4, compressed_size = 4, uncompressed_size = 4,
filename_length = 2, extra_field_length = 2
)
l <- list()
for (i in names(deparself)) {
l[[i]] <- head(depcsv, deparself[i])
depcsv <- tail(depcsv, -deparself[i])
}
filename_length_int <- raw2ToInt(l$filename_length)
extra_field_length_int <- raw2ToInt(l$extra_field_length)
l[["filename"]] <- head(depcsv, filename_length_int)
depcsv <- tail(depcsv, -filename_length_int)
l[["extra_field"]] <- head(depcsv, extra_field_length_int)
if (extra_field_length_int != 0) {
depcsv <- tail(depcsv, -extra_field_length_int)
}
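# Prepend a minimal zlib header (0x78 0x01) so zip::inflate() accepts the raw
# deflate stream, then read the CSV directly from the decompressed bytes.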
c(as.raw(0x78), as.raw(0x01), depcsv) |>
zip::inflate() |>
purrr::chuck("output") -> rawInflated
a <- vroom::vroom(rawConnection(rawInflated))
})
#> Rows: 365 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (6): mediaID, deploymentID, captureMethod, filePath, fileName, fileMedi...
#> lgl (4): filePublic, exifData, favorite, mediaComments
#> dttm (1): timestamp
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> user system elapsed
#> 2.697 0.053 3.217
dplyr::glimpse(a)
#> Rows: 365
#> Columns: 11
#> $ mediaID <chr> "10b0e4da-ca2d-4026-8574-bff8d15a3dcb", "5974ba99-73ed-4…
#> $ deploymentID <chr> "AWD_1_13082021_pilot 46576a8c-019a-4dd8-852e-86380e0973…
#> $ captureMethod <chr> "activityDetection", "activityDetection", "activityDetec…
#> $ timestamp <dttm> 2021-08-14 00:35:58, 2021-08-14 00:35:59, 2021-08-14 00…
#> $ filePath <chr> "media\\AWD_1_13082021_pilot 46576a8c-019a-4dd8-852e-863…
#> $ filePublic <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
#> $ fileName <chr> "10b0e4da-ca2d-4026-8574-bff8d15a3dcb.JPG", "5974ba99-73…
#> $ fileMediatype <chr> "image/jpeg", "image/jpeg", "image/jpeg", "image/jpeg", …
#> $ exifData <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ favorite <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
#> $ mediaComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

Note that this only supports classical zip files, not the larger zip64 format.
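A quick way to spot the zip64 case before attempting the parse above: when a field overflows, the classical EOCD stores 0xFFFFFFFF in it and the real value lives in the zip64 end-of-central-directory record. A minimal check on the 22 EOCD bytes (the eocd argument here is illustrative; it would be the raw vector fetched at the start of get_cd()):

is_zip64 <- function(eocd) {
  # If the 4-byte central directory offset is all 0xFF, the offset has to be
  # read from the zip64 end-of-central-directory record instead.
  all(eocd[17:20] == as.raw(0xff))
}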
Bart sent us a message with an example where he was able to read a single events.csv from a 10 GB archive very quickly. However, two other files took longer. In Python he had better luck.