diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/classifier.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/classifier.py new file mode 100644 index 00000000..e307d9b7 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/classifier.py @@ -0,0 +1,155 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import torch +import torch.nn as nn +from .utils import _tranpose_and_gather_feat, _get_wh_feat, _get_4ps_feat, _normalized_ps +import torch.nn.functional as F + +import json +import cv2 +import os +from .transformer import Transformer +import math +import time +import random +import imgaug.augmenters as iaa +import time +import copy + +class Stacker(nn.Module): + def __init__(self, input_size, hidden_size, output_size, layers, heads=8, dropout=0.1): + super(Stacker, self).__init__() + self.logi_encoder = nn.Sequential( + nn.Linear(input_size, hidden_size), + nn.ReLU(inplace=True), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(inplace=True) #newly added + ) + self.tsfm = Transformer(2 * hidden_size, hidden_size, output_size, layers, heads, dropout) + + def forward(self, outputs, logi, mask = None, require_att = False): + logi_embeddings = self.logi_encoder(logi) + + cat_embeddings = torch.cat((logi_embeddings, outputs), dim=2) + + if mask is None: + if require_att: + stacked_axis, att = self.tsfm(cat_embeddings) + else: + stacked_axis = self.tsfm(cat_embeddings) + else: + stacked_axis = self.tsfm(cat_embeddings, mask=mask) + + if require_att: + return stacked_axis, att + else: + return stacked_axis + +class Processor(nn.Module): + def __init__(self, opt): + super(Processor, self).__init__() + + if opt.wiz_stacking: + self.stacker = Stacker(opt.output_size, opt.hidden_size, opt.output_size, opt.stacking_layers) + + #input_state, hidden_state, output_state, layers, heads, dropout + self.tsfm_axis = Transformer(opt.input_size, opt.hidden_size, opt.output_size, opt.tsfm_layers, opt.num_heads, opt.att_dropout) #original version + self.x_position_embeddings = nn.Embedding(opt.max_fmp_size, opt.hidden_size) + self.y_position_embeddings = nn.Embedding(opt.max_fmp_size, opt.hidden_size) + + self.opt = opt + + def forward(self, outputs, dets = None, batch = None, cc_match = None): #training version forward + # 'outputs' stands for the feature of cells + # mask = None + # att = None + + ''' + Constructing Features: + ''' + if batch is None: + # Inference Mode, the four corner features are gathered + # during bounding boxes decoding for simplicity (See ctdet_4ps_decode() in ./src/lib/model/decode.py). + + vis_feat = outputs + if dets is None: + feat = vis_feat + + else: + left_pe = self.x_position_embeddings(dets[:, :, 0]) + upper_pe = self.y_position_embeddings(dets[:, :, 1]) + right_pe = self.x_position_embeddings(dets[:, :, 2]) + lower_pe = self.y_position_embeddings(dets[:, :, 5]) + feat = vis_feat + left_pe + upper_pe + right_pe + lower_pe + + # !TODO: moving the processings here and uniform the feature construction code for training and inference. 
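+ # In inference mode the decoded corner boxes double as a coarse 2-D position signal:
+ # dets[:, :, 0] / dets[:, :, 1] give the left-x / upper-y and dets[:, :, 2] / dets[:, :, 5]
+ # the right-x / lower-y of the 8-value box; their x/y position embeddings are summed onto
+ # the visual cell feature, mirroring the wiz_2dpe branch of the training path below.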
+ + else: + #Training Mode + ind = batch['hm_ind'] + mask = batch['hm_mask'] #during training, the attention mask will be applied + output = outputs[-1] + pred = output['ax'] + ct_feat = _tranpose_and_gather_feat(pred, ind) + + if self.opt.wiz_2dpe: + cr_feat = _get_4ps_feat(batch['cc_match'], output) + cr_feat = cr_feat.sum(axis = 3) + vis_feat = ct_feat + cr_feat + + ps = _get_wh_feat(ind, batch, 'gt') + ps = _normalized_ps(ps, self.opt.max_fmp_size) + + left_pe = self.x_position_embeddings(ps[:, :, 0]) + upper_pe = self.y_position_embeddings(ps[:, :, 1]) + right_pe = self.x_position_embeddings(ps[:, :, 2]) + lower_pe = self.y_position_embeddings(ps[:, :, 5]) + + feat = vis_feat + left_pe + upper_pe + right_pe + lower_pe + + elif self.opt.wiz_4ps: + cr_feat = _get_4ps_feat(batch['cc_match'], output) + cr_feat = cr_feat.sum(axis = 3) + feat = ct_feat + cr_feat + + elif self.opt.wiz_vanilla: + feat = ct_feat + + ''' + Put Features into TSFM: + ''' + + if batch is None: + #Inference Mode + logic_axis = self.tsfm_axis(feat) + if self.opt.wiz_stacking: + stacked_axis = self.stacker(feat, logic_axis) + else: + #Training Mode + logic_axis = self.tsfm_axis(feat, mask = mask) + if self.opt.wiz_stacking: + stacked_axis = self.stacker(feat, logic_axis, mask = mask) + + if self.opt.wiz_stacking: + return logic_axis, stacked_axis + else: + return logic_axis + +def load_processor(model, model_path, optimizer=None, resume=False, lr=None, lr_step=None): + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) + print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch'])) + state_dict = checkpoint['state_dict'] + model.load_state_dict(state_dict) + return model + +def _judge(box): + countx = len(list(set([box[0],box[2],box[4],box[6]]))) + county = len(list(set([box[1],box[3],box[5],box[7]]))) + if countx<2 or county<2: + return False + + return True + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/data_parallel.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/data_parallel.py new file mode 100644 index 00000000..1a96c0d2 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/data_parallel.py @@ -0,0 +1,128 @@ +import torch +from torch.nn.modules import Module +from torch.nn.parallel.scatter_gather import gather +from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.parallel_apply import parallel_apply + + +from .scatter_gather import scatter_kwargs + +class _DataParallel(Module): + r"""Implements data parallelism at the module level. + + This container parallelizes the application of the given module by + splitting the input across the specified devices by chunking in the batch + dimension. In the forward pass, the module is replicated on each device, + and each replica handles a portion of the input. During the backwards + pass, gradients from each replica are summed into the original module. + + The batch size should be larger than the number of GPUs used. It should + also be an integer multiple of the number of GPUs so that each chunk is the + same size (so that each GPU processes the same number of samples). + + See also: :ref:`cuda-nn-dataparallel-instead` + + Arbitrary positional and keyword inputs are allowed to be passed into + DataParallel EXCEPT Tensors. All variables will be scattered on dim + specified (default 0). 
Primitive types will be broadcasted, but all + other types will be a shallow copy and can be corrupted if written to in + the model's forward pass. + + Args: + module: module to be parallelized + device_ids: CUDA devices (default: all devices) + output_device: device location of output (default: device_ids[0]) + + Example:: + + >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2]) + >>> output = net(input_var) + """ + + # TODO: update notes/cuda.rst when this class handles 8+ GPUs well + + def __init__(self, module, device_ids=None, output_device=None, dim=0, chunk_sizes=None): + super(_DataParallel, self).__init__() + + if not torch.cuda.is_available(): + self.module = module + self.device_ids = [] + return + + if device_ids is None: + device_ids = list(range(torch.cuda.device_count())) + if output_device is None: + output_device = device_ids[0] + self.dim = dim + self.module = module + self.device_ids = device_ids + self.chunk_sizes = chunk_sizes + self.output_device = output_device + if len(self.device_ids) == 1: + self.module.cuda(device_ids[0]) + + def forward(self, *inputs, **kwargs): + if not self.device_ids: + return self.module(*inputs, **kwargs) + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids, self.chunk_sizes) + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) + outputs = self.parallel_apply(replicas, inputs, kwargs) + return self.gather(outputs, self.output_device) + + def replicate(self, module, device_ids): + return replicate(module, device_ids) + + def scatter(self, inputs, kwargs, device_ids, chunk_sizes): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim, chunk_sizes=self.chunk_sizes) + + def parallel_apply(self, replicas, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) + + def gather(self, outputs, output_device): + return gather(outputs, output_device, dim=self.dim) + + +def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None): + r"""Evaluates module(input) in parallel across the GPUs given in device_ids. + + This is the functional version of the DataParallel module. + + Args: + module: the module to evaluate in parallel + inputs: inputs to the module + device_ids: GPU ids on which to replicate module + output_device: GPU location of the output Use -1 to indicate the CPU. 
+ (default: device_ids[0]) + Returns: + a Variable containing the result of module(input) located on + output_device + """ + if not isinstance(inputs, tuple): + inputs = (inputs,) + + if device_ids is None: + device_ids = list(range(torch.cuda.device_count())) + + if output_device is None: + output_device = device_ids[0] + + inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim) + if len(device_ids) == 1: + return module(*inputs[0], **module_kwargs[0]) + used_device_ids = device_ids[:len(inputs)] + replicas = replicate(module, used_device_ids) + outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids) + return gather(outputs, output_device, dim) + +def DataParallel(module, device_ids=None, output_device=None, dim=0, chunk_sizes=None): + if chunk_sizes is None: + return torch.nn.DataParallel(module, device_ids, output_device, dim) + standard_size = True + for i in range(1, len(chunk_sizes)): + if chunk_sizes[i] != chunk_sizes[0]: + standard_size = False + if standard_size: + return torch.nn.DataParallel(module, device_ids, output_device, dim) + return _DataParallel(module, device_ids, output_device, dim, chunk_sizes) \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/decode.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/decode.py new file mode 100644 index 00000000..e07da2fe --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/decode.py @@ -0,0 +1,383 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +from .utils import _gather_feat, _tranpose_and_gather_feat, _get_4ps_feat +import numpy as np +import shapely +import time +from shapely.geometry import Polygon, MultiPoint, Point + +def _nms(heat, name, kernel=3): + pad = (kernel - 1) // 2 + + hmax = nn.functional.max_pool2d( + heat, (kernel, kernel), stride=1, padding=pad) + #save_map(hmax.cpu().numpy()[0],name) + keep = (hmax == heat).float() + return heat * keep,keep + +def _topk_channel(scores, K=40): + + batch, cat, height, width = scores.size() + + topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) + + topk_inds = topk_inds % (height * width) + topk_ys = (topk_inds / width).int().float() + topk_xs = (topk_inds % width).int().float() + + return topk_scores, topk_inds, topk_ys, topk_xs + +def _topk(scores, K=40, device=None): + #import ipdb + #ipdb.set_trace() + batch, cat, height, width = scores.size() + + topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) + + topk_inds = topk_inds % (torch.Tensor([height]).to(torch.int64).to(device) * torch.Tensor([width]).to(torch.int64).to(device)) + topk_ys = (topk_inds / torch.Tensor([width]).to(device)).int().float() + topk_xs = (topk_inds % torch.Tensor([width]).to(torch.int64).to(device)).int().float() + + topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) + topk_clses = (topk_ind // K).int() + topk_inds = _gather_feat( + topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) + topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) + topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) + + return topk_score, topk_inds, topk_clses, topk_ys, topk_xs + +def corner_decode(mk, st_reg, mk_reg=None, K=400,device=None): + batch, cat, height, width = mk.size() + mk,keep = _nms(mk,'mk.0.maxpool') + scores, inds, clses, ys, 
xs = _topk(mk, K=K, device=device) + if mk_reg is not None: + reg = _tranpose_and_gather_feat(mk_reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + scores = scores.view(batch, K, 1) + st_Reg = _tranpose_and_gather_feat(st_reg, inds) + bboxes = torch.cat([xs - st_Reg[..., 0:1], + ys - st_Reg[..., 1:2], + xs - st_Reg[..., 2:3], + ys - st_Reg[..., 3:4], + xs - st_Reg[..., 4:5], + ys - st_Reg[..., 5:6], + xs - st_Reg[..., 6:7], + ys - st_Reg[..., 7:8]], dim=2) + corner_dict = {'scores': scores, 'inds': inds, 'ys': ys, 'xs': xs, 'gboxes': bboxes} + return scores, inds, ys, xs, bboxes, corner_dict + +def ctdet_4ps_decode(heat, wh, ax, cr, corner_dict=None, reg=None, cat_spec_wh=False, K=100, wiz_rev = False): + + # if wiz_rev : + # print('Grouping and Parsing ...') + batch, cat, height, width = heat.size() + device = heat.device + # heat = torch.sigmoid(heat) + # perform nms on heatmaps + heat,keep = _nms(heat,'hm.0.maxpool') + + scores, inds, clses, ys, xs = _topk(heat, K=K,device=device) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + wh = _tranpose_and_gather_feat(wh, inds) + ax = _tranpose_and_gather_feat(ax, inds) + + if cat_spec_wh: + wh = wh.view(batch, K, cat, 8) + clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 8).long() + wh = wh.gather(2, clses_ind).view(batch, K, 8) + else: + wh = wh.view(batch, K, 8) + clses = clses.view(batch, K, 1).float() + scores = scores.view(batch, K, 1) + + ''' + bboxes = torch.cat([xs - wh[..., 0:1], + ys - wh[..., 1:2], + xs + wh[..., 2:3], + ys - wh[..., 3:4], + xs + wh[..., 4:5], + ys + wh[..., 5:6], + xs - wh[..., 6:7], + ys + wh[..., 7:8]], dim=2) + ''' + + bboxes = torch.cat([xs - wh[..., 0:1], + ys - wh[..., 1:2], + xs - wh[..., 2:3], + ys - wh[..., 3:4], + xs - wh[..., 4:5], + ys - wh[..., 5:6], + xs - wh[..., 6:7], + ys - wh[..., 7:8]], dim=2) + + rev_time_s1 = time.time() + if wiz_rev : + bboxes_rev = bboxes.clone() + bboxes_cpu = bboxes.clone().cpu() + + gboxes = corner_dict['gboxes'] + gboxes_cpu = gboxes.cpu() + + num_bboxes = bboxes.shape[1] + num_gboxes = gboxes.shape[1] + + corner_xs = corner_dict['xs'] + corner_ys = corner_dict['ys'] + corner_scores = corner_dict['scores'] + + + for i in range(num_bboxes): + if scores[0,i,0] >= 0.2 : + count = 0 # counting the number of ends of st head in bbox i + for j in range(num_gboxes): + if corner_scores[0,j,0] >= 0.3: + #here comes to one pair of valid bbox and gbox + #step1 is there an overlap + + bbox = bboxes_cpu[0,i,:] + gbox = gboxes_cpu[0,j,:] + #rev_time_s3 = time.time() + if is_group_faster_faster(bbox, gbox): + #step2 find which corner point to refine, and do refine + cr_x = corner_xs[0,j,0] + cr_y = corner_ys[0,j,0] + + ind4ps = find4ps(bbox, cr_x, cr_y, device) + if bboxes_rev[0, i, 2*ind4ps] == bboxes[0, i, 2*ind4ps] and bboxes_rev[0, i, 2*ind4ps+1] == bboxes[0, i, 2*ind4ps+1]: + #first_shift + count = count + 1 + bboxes_rev[0, i, 2*ind4ps] = cr_x + bboxes_rev[0, i, 2*ind4ps + 1] = cr_y + else: + origin_x = bboxes[0, i, 2*ind4ps] + origin_y = bboxes[0, i, 2*ind4ps+1] + + old_x = bboxes_rev[0, i, 2*ind4ps] + old_y = bboxes_rev[0, i, 2*ind4ps+1] + + if dist(origin_x, origin_y, old_x, old_y) >= dist(origin_x, 
origin_y, cr_x, cr_y): + count = count + 1 + bboxes_rev[0, i, 2*ind4ps] = cr_x + bboxes_rev[0, i, 2*ind4ps + 1] = cr_y + else: + continue + else: + + continue + else: + break + if count <= 2: + scores[0,i,0] = scores[0,i,0] * 0.4 + else : + break + + if wiz_rev: + + cc_match = torch.cat([(bboxes_rev[:,:,0:1]) + width * torch.round(bboxes_rev[:,:,1:2]), + (bboxes_rev[:,:,2:3]) + width * torch.round(bboxes_rev[:,:,3:4]), + (bboxes_rev[:,:,4:5]) + width * torch.round(bboxes_rev[:,:,5:6]), + (bboxes_rev[:,:,6:7]) + width * torch.round(bboxes_rev[:,:,7:8])], dim=2) + + else: + cc_match = torch.cat([(xs - wh[..., 0:1]) + width * torch.round(ys - wh[..., 1:2]), + (xs - wh[..., 2:3]) + width * torch.round(ys - wh[..., 3:4]), + (xs - wh[..., 4:5]) + width * torch.round(ys - wh[..., 5:6]), + (xs - wh[..., 6:7]) + width * torch.round(ys - wh[..., 7:8])], dim=2) + + cc_match = torch.round(cc_match).to(torch.int64) + + cr_feat = _get_4ps_feat(cc_match, cr) + cr_feat = cr_feat.sum(axis = 3) + if wiz_rev: + detections = torch.cat([bboxes_rev, scores, clses], dim=2) + _, sorted_ind = torch.sort(scores, descending=True, dim=1) + sorted_inds = sorted_ind.expand(detections.size(0), detections.size(1), detections.size(2)) + detections = detections.gather(1, sorted_inds) + sorted_inds2 = sorted_ind.expand(detections.size(0), detections.size(1), ax.size(2)) + ax = ax.gather(1, sorted_inds2) + else: + + detections = torch.cat([bboxes, scores, clses], dim=2) + + return detections, keep, ax, cr_feat + +def wireless_decode(heat, wh, ax, cr, reg=None, cat_spec_wh=False, K=100): + + batch, cat, height, width = heat.size() + # heat = torch.sigmoid(heat) + # perform nms on heatmaps + heat,keep = _nms(heat,'hm.0.maxpool') + + scores, inds, clses, ys, xs = _topk(heat, K=K) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + wh = _tranpose_and_gather_feat(wh, inds) + ax = _tranpose_and_gather_feat(ax, inds) + + if cat_spec_wh: + wh = wh.view(batch, K, cat, 8) + clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 8).long() + wh = wh.gather(2, clses_ind).view(batch, K, 8) + else: + wh = wh.view(batch, K, 8) + clses = clses.view(batch, K, 1).float() + scores = scores.view(batch, K, 1) + ''' + bboxes = torch.cat([xs - wh[..., 0:1], + ys - wh[..., 1:2], + xs + wh[..., 2:3], + ys - wh[..., 3:4], + xs + wh[..., 4:5], + ys + wh[..., 5:6], + xs - wh[..., 6:7], + ys + wh[..., 7:8]], dim=2) + ''' + bboxes = torch.cat([xs - wh[..., 0:1], + ys - wh[..., 1:2], + xs - wh[..., 2:3], + ys - wh[..., 3:4], + xs - wh[..., 4:5], + ys - wh[..., 5:6], + xs - wh[..., 6:7], + ys - wh[..., 7:8]], dim=2) + + cc_match = torch.cat([(xs - wh[..., 0:1]) + width * torch.round(ys - wh[..., 1:2]), + (xs - wh[..., 2:3]) + width * torch.round(ys - wh[..., 3:4]), + (xs - wh[..., 4:5]) + width * torch.round(ys - wh[..., 5:6]), + (xs - wh[..., 6:7]) + width * torch.round(ys - wh[..., 7:8])], dim=2) + + cc_match = torch.round(cc_match).to(torch.int64) + + cr_feat = _get_4ps_feat(cc_match, cr) + cr_feat = cr_feat.sum(axis = 3) + + detections = torch.cat([bboxes, scores, clses], dim=2) + return detections, keep, ax, cr_feat + +def find4ps(bbox, x, y,device): + xs = torch.Tensor([bbox[0],bbox[2],bbox[4],bbox[6]]).to(device) + ys = torch.Tensor([bbox[1],bbox[3],bbox[5],bbox[7]]).to(device) + + dx = xs - x + dy = ys - y + + l = dx**2 + dy**2 + return 
torch.argmin(l) + +def dist(x1, y1, x2, y2): + dx = x1 - x2 + dy = y1 - y2 + l = dx**2 + dy**2 + return l + +def rect_inter(b1_x1, b1_y1, b1_x2, b1_y2, b2_x1, b2_y1, b2_x2, b2_y2): + if (b1_x1 <= b2_x1 and b2_x1 <= b1_x2) or (b1_x1 <= b2_x2 and b2_x2 <= b1_x2): + if (b1_y1 <= b2_y1 and b2_y1 <= b1_y2) or (b1_y1 <= b2_y2 and b2_y2 <= b1_y2): + return True + else: + return False + else: + return False + +def is_group_faster_faster(bbox, gbox): + bbox = bbox.view(4,2) + gbox = gbox.view(4,2) + + bbox_xmin, bbox_xmax, bbox_ymin, bbox_ymax = bbox[:,0].min(), bbox[:,0].max(), bbox[:,1].min(), bbox[:,1].max()#min(bbox_xs), max(bbox_xs), min(bbox_ys), max(bbox_ys) + gbox_xmin, gbox_xmax, gbox_ymin, gbox_ymax = gbox[:,0].min(), gbox[:,0].max(), gbox[:,1].min(), gbox[:,1].max() + + if bbox_xmin > gbox_xmax or gbox_xmin > bbox_xmax or bbox_ymin > gbox_ymax or gbox_ymin > bbox_ymax: + return False + else: + bpoly = Polygon(bbox) + + flag = 0 + for i in range(4): + p = Point(gbox[i]) + if p.within(bpoly): + flag = 1 + break + if flag == 0: + return False + else : + return True + +def ctdet_st_decode(heat, st, reg=None, cat_spec_wh=False, K=100): + batch, cat, height, width = heat.size() + heat,keep = _nms(heat,'hm.0.maxpool') + + scores, inds, clses, ys, xs = _topk(heat, K=K) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + st = _tranpose_and_gather_feat(st, inds) + if cat_spec_wh: + st = st.view(batch, K, cat, 4) + clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 4).long() + st = st.gather(2, clses_ind).view(batch, K, 4) + else: + st = st.view(batch, K, 4) + + return st + +def ctdet_decode(heat, wh, reg=None, cat_spec_wh=False, K=100): + batch, cat, height, width = heat.size() + + # heat = torch.sigmoid(heat) + # perform nms on heatmaps + heat = _nms(heat) + + scores, inds, clses, ys, xs = _topk(heat, K=K) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + wh = _tranpose_and_gather_feat(wh, inds) + if cat_spec_wh: + wh = wh.view(batch, K, cat, 2) + clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 2).long() + wh = wh.gather(2, clses_ind).view(batch, K, 2) + else: + wh = wh.view(batch, K, 2) + clses = clses.view(batch, K, 1).float() + scores = scores.view(batch, K, 1) + bboxes = torch.cat([xs - wh[..., 0:1] / 2, + ys - wh[..., 1:2] / 2, + xs + wh[..., 0:1] / 2, + ys + wh[..., 1:2] / 2], dim=2) + detections = torch.cat([bboxes, scores, clses], dim=2) + + return detections + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/losses.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/losses.py new file mode 100644 index 00000000..ca0e35c5 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/losses.py @@ -0,0 +1,245 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +from .utils import _flatten_and_gather_feat, _tranpose_and_gather_feat +import torch.nn.functional as F + +def _neg_loss(pred, gt): + ''' Modified focal loss. 
Exactly the same as CornerNet. + Runs faster and costs a little bit more memory + Arguments: + pred (batch x c x h x w) + gt_regr (batch x c x h x w) + ''' + pos_inds = gt.eq(1).float() + neg_inds = gt.lt(1).float() + + neg_weights = torch.pow(1 - gt, 4) + + loss = 0 + + pos_loss = torch.log(pred) * torch.pow(1 - pred, 2) * pos_inds + neg_loss = torch.log(1 - pred) * torch.pow(pred, 2) * neg_weights * neg_inds + + num_pos = pos_inds.float().sum() + pos_loss = pos_loss.sum() + neg_loss = neg_loss.sum() + + if num_pos == 0: + loss = loss - neg_loss + else: + loss = loss - (pos_loss + neg_loss) / num_pos + return loss + +def _reg_loss(regr, gt_regr, mask): + ''' L1 regression loss + Arguments: + regr (batch x max_objects x dim) + gt_regr (batch x max_objects x dim) + mask (batch x max_objects) + ''' + num = mask.float().sum() + mask = mask.unsqueeze(2).expand_as(gt_regr).float() + + regr = regr * mask + gt_regr = gt_regr * mask + + regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, size_average=False) + regr_loss = regr_loss / (num + 1e-4) + return regr_loss + +class AxisLoss(nn.Module): + def __init__(self): + super(AxisLoss, self).__init__() + + def forward(self, output, mask, ind, target, logi=None): + span_type = False + #computing vanilla axis loss + if logi is None: + pred = _tranpose_and_gather_feat(output, ind) + else: + pred = logi + + mask = mask.unsqueeze(2).float() + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (4*(mask.sum() + 1e-4)) + + return loss + +class FocalLoss(nn.Module): + '''nn.Module warpper for focal loss''' + def __init__(self): + super(FocalLoss, self).__init__() + self.neg_loss = _neg_loss + + def forward(self, out, target): + return self.neg_loss(out, target) + +class RegLoss(nn.Module): + '''Regression loss for an output tensor + Arguments: + output (batch x dim x h x w) + mask (batch x max_objects) + ind (batch x max_objects) + target (batch x max_objects x dim) + ''' + def __init__(self): + super(RegLoss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + loss = _reg_loss(pred, target, mask) + return loss + +class RegL1Loss(nn.Module): + def __init__(self): + super(RegL1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.unsqueeze(2).expand_as(pred).float() + # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (mask.sum() + 1e-4) + return loss + +class PairLoss(nn.Module): + def __init__(self): + super(PairLoss, self).__init__() + + def forward(self, output1, ind1, output2, ind2, mask, mask_cro, ctr_cro_ind, target1, target2, hm_ctxy): + + pred1 = _tranpose_and_gather_feat(output1, ind1) #bxmx8 + pred2 = _tranpose_and_gather_feat(output2, ind2) #bxnx8 + pred2_tmp = pred2 + target2_tmp = target2 + mask = mask.unsqueeze(2).expand_as(pred1).float() + + b = pred1.size(0) + m = pred1.size(1) + n = pred2.size(1) + pred2 = pred2.view(b,4*n,2) + ctr_cro_ind = ctr_cro_ind.unsqueeze(2).expand(b,4*m,2) + pred2 = pred2.gather(1,ctr_cro_ind).view(b,m,8) #bxmx8 + target2 = target2.view(b,4*n,2).gather(1,ctr_cro_ind).view(b,m,8) + + delta = (torch.abs(pred1-target1)+torch.abs(pred2-target2)) / (torch.abs(target1) + 1e-4) + delta = delta * delta + delta_mask = (~delta.gt(1.0))*1#1 - delta.gt(1.0) + delta = delta*(delta_mask.float())+(1-delta_mask).float() + delta = (-3.14)*delta + 
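+ # delta above is the squared relative error of the paired predictions, clamped to 1;
+ # weight = 1 - exp(-3.14 * delta) rises from 0 (perfect pair) towards ~0.96 (relative error >= 100%),
+ # so the weighted L1 terms below concentrate on poorly matched pairs.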
weight = 1 - torch.exp(delta) + + loss1 = F.l1_loss(pred1 * mask * weight, target1 * mask * weight, size_average=False) + loss2 = F.l1_loss(pred2 * mask * weight, target2 * mask * weight, size_average=False) + loss1 = loss1 / (mask.sum() + 1e-4) + loss2 = loss2 / (mask.sum() + 1e-4) + + mask1 = (target2_tmp==0) + mask_cro = mask_cro.unsqueeze(2).expand(b,n,8) + MASK = (mask1==mask_cro).float() + loss3 = F.l1_loss(pred2_tmp * MASK, target2_tmp * MASK, size_average=False) + loss3 = loss3 / (mask.sum() + 1e-4) + + return loss1, 0.5 * loss2 + 0.2 * loss3 + +class NormRegL1Loss(nn.Module): + def __init__(self): + super(NormRegL1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.unsqueeze(2).expand_as(pred).float() + # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + pred = pred / (target + 1e-4) + target = target * 0 + 1 + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (mask.sum() + 1e-4) + return loss + +class RegWeightedL1Loss(nn.Module): + def __init__(self): + super(RegWeightedL1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.float() + # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (mask.sum() + 1e-4) + return loss + +class L1Loss(nn.Module): + def __init__(self): + super(L1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.unsqueeze(2).expand_as(pred).float() + loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + return loss + + +def compute_res_loss(output, target): + return F.smooth_l1_loss(output, target, reduction='elementwise_mean') + + +def _axis_eval(output, mask, ind, target, logi=None, mode = None): + if logi is None: + pred = _tranpose_and_gather_feat(output, ind) + else: + pred = logi + + dev = pred - target + dev = torch.abs(dev) + + one = torch.ones(dev.shape).cuda() + zero = torch.zeros(dev.shape).cuda() + + true_vec = torch.where(dev < 0.5, one, zero) + #true = torch.sum((true_vec * mask.unsqueeze(2)) == 1) + true = torch.sum(true_vec.sum(axis=2)*mask == 4) + total = (mask == 1).sum() + true = true.to(torch.float32) + total = total.to(torch.float32) + + acc = true/total + + if acc is None: + acc = 0 + + if mode == 'full': + pred_int = process_logi(pred) + + pred_pair = _make_pair_feat(pred_int) + target_pair = _make_pair_feat(target) + mask_pair = _make_pair_feat(ind) + + #pred_int = pred_int.expand() + + mask_1 = mask_pair[:,:,:,0] + mask_2 = mask_pair[:,:,:,1] + + at_vec_h = (target_pair[:,:,:,2]<=target_pair[:,:,:,6]) & (target_pair[:,:,:,6]<=target_pair[:,:,:,3]) & (mask_1 != 0) & (mask_2 != 0) + at_vec_w = (target_pair[:,:,:,0]<=target_pair[:,:,:,4]) & (target_pair[:,:,:,4]<=target_pair[:,:,:,1]) & (mask_1 != 0) & (mask_2 != 0) + + ap_vec_h = (pred_pair[:,:,:,2]<=pred_pair[:,:,:,6]) & (pred_pair[:,:,:,6]<=pred_pair[:,:,:,3]) & (mask_1 != 0) & (mask_2 != 0) + ap_vec_w = (pred_pair[:,:,:,0]<=pred_pair[:,:,:,4]) & (pred_pair[:,:,:,4]<=pred_pair[:,:,:,1]) & (mask_1 != 0) & (mask_2 != 0) + + tp_h = at_vec_h & ap_vec_h + tp_w = at_vec_w & ap_vec_w + + ap = torch.sum(ap_vec_h) + torch.sum(ap_vec_w) + at = torch.sum(at_vec_h) + torch.sum(at_vec_w) + tp = torch.sum(tp_h) + torch.sum(tp_w) + + pre = tp/(ap + 1e-4) + rec = tp/(at + 
1e-4) + + return acc , pre, rec + else: + return acc + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/model.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/model.py new file mode 100644 index 00000000..dfef0dd1 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/model.py @@ -0,0 +1,151 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torchvision.models as models +import torch +import torch.nn as nn +import os + +from .networks.fpn_resnet import get_pose_net_fpn +from .networks.fpn_resnet_half import get_pose_net_fpn_half + +from .networks.fpn_mask_resnet import get_pose_net_fpn_mask +from .networks.fpn_mask_resnet_half import get_pose_net_fpn_mask_half + +from .networks.pose_dla_dcn import get_pose_net as get_dla_dcn + + +_model_factory = { + 'dla': get_dla_dcn, + 'resfpn':get_pose_net_fpn, + 'resfpnhalf': get_pose_net_fpn_half, + 'resfpnmask':get_pose_net_fpn_mask, + 'resfpnmaskhalf':get_pose_net_fpn_mask_half +} + +def create_model(arch, heads, head_conv): + num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0 + arch = arch[:arch.find('_')] if '_' in arch else arch + get_model = _model_factory[arch] + model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv) + return model + +def load_model(model, model_path, optimizer=None, resume=False, + lr=None, lr_step=None): + start_epoch = 0 + + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) + + print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch'])) + state_dict_ = checkpoint['state_dict'] + state_dict = {} + + # convert data_parallal to model + for k in state_dict_: + if k.startswith('module') and not k.startswith('module_list'): + state_dict[k[7:]] = state_dict_[k] + else: + state_dict[k] = state_dict_[k] + print(model, file=open('model.txt', 'a')) + model_state_dict = model.state_dict() + + # check loaded parameters and created model parameters + for k in state_dict: + if k in model_state_dict: + if state_dict[k].shape != model_state_dict[k].shape: + print('Skip loading parameter {}, required shape{}, '\ + 'loaded shape{}.'.format( + k, model_state_dict[k].shape, state_dict[k].shape)) + state_dict[k] = model_state_dict[k] + else: + print('Drop parameter {}.'.format(k)) + for k in model_state_dict: + if not (k in state_dict): + print('No param {}.'.format(k)) + state_dict[k] = model_state_dict[k] + model.load_state_dict(state_dict, strict=False) + + # resume optimizer parameters + if optimizer is not None and resume: + if 'optimizer' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + start_epoch = checkpoint['epoch'] + start_lr = lr + for step in lr_step: + if start_epoch >= step: + start_lr *= 0.1 + for param_group in optimizer.param_groups: + param_group['lr'] = start_lr + print('Resumed optimizer with start lr', start_lr) + else: + print('No optimizer parameters in checkpoint.') + if optimizer is not None: + return model, optimizer, start_epoch + else: + return model + +def load_multiple(model, model_path, optimizer=None, resume=False, + lr=None, lr_step=None): + start_epoch = 0 + + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) + + print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch'])) + state_dict_ = checkpoint['state_dict'] + state_dict = {} + + # convert data_parallal to model + for k in state_dict_: 
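+ # checkpoints saved from an nn.DataParallel model prefix every key with 'module.';
+ # strip that prefix so the weights load into the bare (non-parallel) model.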
+ if k.startswith('module') and not k.startswith('module_list'): + state_dict[k[7:]] = state_dict_[k] + else: + state_dict[k] = state_dict_[k] + model_state_dict = model.state_dict() + + # check loaded parameters and created model parameters + for k in state_dict: + if k in model_state_dict: + if state_dict[k].shape != model_state_dict[k].shape: + print('Skip loading parameter {}, required shape{}, '\ + 'loaded shape{}.'.format( + k, model_state_dict[k].shape, state_dict[k].shape)) + state_dict[k] = model_state_dict[k] + else: + print('Drop parameter {}.'.format(k)) + for k in model_state_dict: + if not (k in state_dict): + print('No param {}.'.format(k)) + state_dict[k] = model_state_dict[k] + model.load_state_dict(state_dict, strict=False) + + # resume optimizer parameters + if optimizer is not None and resume: + if 'optimizer' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + start_epoch = checkpoint['epoch'] + start_lr = lr + for step in lr_step: + if start_epoch >= step: + start_lr *= 0.1 + for param_group in optimizer.param_groups: + param_group['lr'] = start_lr + print('Resumed optimizer with start lr', start_lr) + else: + print('No optimizer parameters in checkpoint.') + if optimizer is not None: + return model, optimizer, start_epoch + else: + return model + +def save_model(path, epoch, model, optimizer=None): + if isinstance(model, torch.nn.DataParallel): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + data = {'epoch': epoch, + 'state_dict': state_dict} + if not (optimizer is None): + data['optimizer'] = optimizer.state_dict() + torch.save(data, path) + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/PKG-INFO b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/PKG-INFO new file mode 100644 index 00000000..0e4d91d8 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/PKG-INFO @@ -0,0 +1,7 @@ +Metadata-Version: 2.1 +Name: DCNv2 +Version: 0.1 +Summary: deformable convolutional networks +Home-page: https://github.com/charlesshang/DCNv2 +Author: charlesshang +License-File: LICENSE diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/SOURCES.txt b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/SOURCES.txt new file mode 100644 index 00000000..f4eeb2b5 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/SOURCES.txt @@ -0,0 +1,13 @@ +LICENSE +setup.py +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.cpp +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu 
+/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu +DCNv2.egg-info/PKG-INFO +DCNv2.egg-info/SOURCES.txt +DCNv2.egg-info/dependency_links.txt +DCNv2.egg-info/top_level.txt \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/dependency_links.txt b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/dependency_links.txt new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/top_level.txt b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/top_level.txt new file mode 100644 index 00000000..8eafb8a8 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/top_level.txt @@ -0,0 +1 @@ +_ext diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/LICENSE b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/LICENSE new file mode 100644 index 00000000..b2e3b520 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2019, Charles Shang +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/_ext.cpython-37m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/_ext.cpython-37m-x86_64-linux-gnu.so new file mode 100755 index 00000000..33ff3ea9 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/_ext.cpython-37m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/lib.linux-x86_64-cpython-37/_ext.cpython-37m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/lib.linux-x86_64-cpython-37/_ext.cpython-37m-x86_64-linux-gnu.so new file mode 100755 index 00000000..33ff3ea9 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/lib.linux-x86_64-cpython-37/_ext.cpython-37m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/build.ninja new file mode 100644 index 00000000..ce664703 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/build.ninja @@ -0,0 +1,34 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda/bin/nvcc + +cflags = -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /ssd8/exec/huangjy/miniconda3/envs/vgt/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include -fPIC -O2 -isystem /ssd8/exec/huangjy/miniconda3/envs/vgt/include -fPIC -DWITH_CUDA -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include/python3.9 -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 +cuda_cflags = -DWITH_CUDA -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include 
-I/ssd8/exec/huangjy/miniconda3/envs/vgt/include/python3.9 -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -ccbin g++ -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + depfile = $out.d + deps = gcc + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu +build 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.cpp + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_deps b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_deps new file mode 100644 index 00000000..d4d080be Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_deps differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_log b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_log new file mode 100644 index 00000000..45c44099 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_log @@ -0,0 +1,57 @@ +# ninja log v5 +1 2313 1710137265966278799 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o 22833fbca9713fe0 +1 2816 1710137266468282295 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 2667d4baf8b5407c +1 3128 1710137266780284468 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o e8d861483d5a7377 +2 7548 1710137271198315232 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o e1e4b0044419bc08 +2 7644 1710137271294315901 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o da48a83253f181bc +2 7759 1710137271410316708 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 35b5b9ec39df087 +2 14817 1710137278463365822 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o 1cbfc7ac6fc9c0f0 +22 2467 1710137281370386064 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b7c1efd29df8f47f +22 2840 1710137281742388655 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 5ada4b2f1d493b12 +22 2986 1710137281888389671 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o a2182f7e45c0b54 +22 6892 1710137285793416602 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 13c07617accbab3a +22 7101 1710137286002418042 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o c2b91aa2571f2a +22 7212 1710137286113418807 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o 985c1e87cefa513c +22 13054 1710137291950459029 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o ba9527b6206ee297 +17 2206 1710224953594403051 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o 22833fbca9713fe0 +17 2933 1710224954320407881 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 2667d4baf8b5407c +17 3048 1710224954435408646 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o e8d861483d5a7377 +18 6827 1710224958213433783 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o e1e4b0044419bc08 +18 7010 1710224958395434994 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 35b5b9ec39df087 +18 7194 1710224958580436225 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o da48a83253f181bc +18 13336 1710224964718477065 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o 1cbfc7ac6fc9c0f0 +16 2403 1710224967513495661 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b7c1efd29df8f47f +16 2656 1710224967765497338 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 5ada4b2f1d493b12 +16 3000 1710224968108499620 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o a2182f7e45c0b54 +17 6981 1710224972089526108 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o c2b91aa2571f2a +17 7089 1710224972196526820 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 13c07617accbab3a +16 7161 1710224972267527292 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o 985c1e87cefa513c +17 13132 1710224978234567010 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o ba9527b6206ee297 +36 2507 1710230192773252741 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o 22833fbca9713fe0 +36 3054 1710230193318256641 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 2667d4baf8b5407c +36 3315 1710230193580258516 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o e8d861483d5a7377 +36 7396 1710230197659287706 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 35b5b9ec39df087 +36 7458 1710230197721288149 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o e1e4b0044419bc08 +36 7904 1710230198168291348 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o da48a83253f181bc +36 14342 1710230204601337383 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o 1cbfc7ac6fc9c0f0 +23 2189 1710230207208356039 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b7c1efd29df8f47f +23 2683 1710230207700359560 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 5ada4b2f1d493b12 +23 3008 1710230208025361886 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o a2182f7e45c0b54 +23 6932 1710230211947389952 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o c2b91aa2571f2a +23 6960 1710230211977390167 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 13c07617accbab3a +23 7106 1710230212122391204 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o 985c1e87cefa513c +24 14589 1710230219600444723 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o ba9527b6206ee297 +41 2415 1710474074089677356 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o 22833fbca9713fe0 +41 2915 1710474074588681031 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 2667d4baf8b5407c +41 3266 1710474074939683616 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o e8d861483d5a7377 +42 7080 1710474078751711686 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o e1e4b0044419bc08 +42 7266 1710474078937713056 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 35b5b9ec39df087 +42 7358 1710474079029713733 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o da48a83253f181bc +42 13563 1710474085230759396 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o 1cbfc7ac6fc9c0f0 +21 2154 1710474087804778350 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b7c1efd29df8f47f +21 2945 1710474088594784168 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 5ada4b2f1d493b12 +21 3037 1710474088686784845 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o a2182f7e45c0b54 +21 6778 1710474092426812386 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o c2b91aa2571f2a +21 7076 1710474092723814573 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 13c07617accbab3a +21 7171 1710474092818815272 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o 985c1e87cefa513c +21 13186 1710474098828859528 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o ba9527b6206ee297 diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/build.ninja new file mode 100644 index 00000000..38c79261 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/build.ninja @@ -0,0 +1,32 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda/bin/nvcc + +cflags = -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -DWITH_CUDA -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 +cuda_cflags = -DWITH_CUDA -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src 
-I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_75,code=sm_75 -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu +build 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.cpp + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o new file mode 100644 index 00000000..1f8ea657 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o new file mode 100644 index 00000000..cf177c40 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o new file mode 100644 index 00000000..a3a70b44 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o differ diff --git 
a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o new file mode 100644 index 00000000..de014f21 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o new file mode 100644 index 00000000..f8b74ce4 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o new file mode 100644 index 00000000..052038c4 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o new file mode 100644 index 00000000..fe667a45 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o differ diff --git 
a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/dcn_v2.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/dcn_v2.py new file mode 100644 index 00000000..1b3ae6ec --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/dcn_v2.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python +from __future__ import absolute_import, division, print_function + +import math + +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +import _ext as _backend + + +class _DCNv2(Function): + @staticmethod + def forward( + ctx, input, offset, mask, weight, bias, stride, padding, dilation, deformable_groups + ): + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.kernel_size = _pair(weight.shape[2:4]) + ctx.deformable_groups = deformable_groups + output = _backend.dcn_v2_forward( + input, + weight, + bias, + offset, + mask, + ctx.kernel_size[0], + ctx.kernel_size[1], + ctx.stride[0], + ctx.stride[1], + ctx.padding[0], + ctx.padding[1], + ctx.dilation[0], + ctx.dilation[1], + ctx.deformable_groups, + ) + ctx.save_for_backward(input, offset, mask, weight, bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input, grad_offset, grad_mask, grad_weight, grad_bias = _backend.dcn_v2_backward( + input, + weight, + bias, + offset, + mask, + grad_output, + ctx.kernel_size[0], + ctx.kernel_size[1], + ctx.stride[0], + ctx.stride[1], + ctx.padding[0], + ctx.padding[1], + ctx.dilation[0], + ctx.dilation[1], + ctx.deformable_groups, + ) + + return grad_input, grad_offset, grad_mask, grad_weight, grad_bias, None, None, None, None + + @staticmethod + def symbolic( + g, input, offset, mask, weight, bias, stride, padding, dilation, deformable_groups + ): + from torch.nn.modules.utils import _pair + + stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + # as of trt 7, the dcn operation will be translated again by modifying the onnx file + # so the exporting code is kept to resemble the forward() + return g.op( + "ai.onnx.contrib::_DCNv2_2", + input, + offset, + mask, + weight, + bias, + stride_i=stride, + padding_i=padding, + dilation_i=dilation, + deformable_groups_i=deformable_groups, + ) + + +dcn_v2_conv = _DCNv2.apply + + +class DCNv2(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + deformable_groups=1, + ): + super(DCNv2, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size)) + self.bias = nn.Parameter(torch.Tensor(out_channels)) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1.0 / math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + self.bias.data.zero_() + + def forward(self, input, offset, mask): + assert ( + 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] + == offset.shape[1] + ) + assert self.deformable_groups * 
self.kernel_size[0] * self.kernel_size[1] == mask.shape[1] + return dcn_v2_conv( + input, + offset, + mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups, + ) + + +class DCN(DCNv2): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + deformable_groups=1, + ): + super(DCN, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, deformable_groups + ) + + channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] + self.conv_offset_mask = nn.Conv2d( + self.in_channels, + channels_, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True, + ) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, input): + out = self.conv_offset_mask(input) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return dcn_v2_conv( + input, + offset, + mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups, + ) + + +class _DCNv2Pooling(Function): + @staticmethod + def forward( + ctx, + input, + rois, + offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=0.0, + ): + ctx.spatial_scale = spatial_scale + ctx.no_trans = int(no_trans) + ctx.output_dim = output_dim + ctx.group_size = group_size + ctx.pooled_size = pooled_size + ctx.part_size = pooled_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + output, output_count = _backend.dcn_v2_psroi_pooling_forward( + input, + rois, + offset, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std, + ) + ctx.save_for_backward(input, rois, offset, output_count) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, offset, output_count = ctx.saved_tensors + grad_input, grad_offset = _backend.dcn_v2_psroi_pooling_backward( + grad_output, + input, + rois, + offset, + output_count, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std, + ) + + return grad_input, None, grad_offset, None, None, None, None, None, None, None, None + + +dcn_v2_pooling = _DCNv2Pooling.apply + + +class DCNv2Pooling(nn.Module): + def __init__( + self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=0.0, + ): + super(DCNv2Pooling, self).__init__() + self.spatial_scale = spatial_scale + self.pooled_size = pooled_size + self.output_dim = output_dim + self.no_trans = no_trans + self.group_size = group_size + self.part_size = pooled_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, input, rois, offset): + assert input.shape[1] == self.output_dim + if self.no_trans: + offset = input.new() + return dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) + + +class DCNPooling(DCNv2Pooling): + def __init__( + self, + spatial_scale, + 
pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=0.0, + deform_fc_dim=1024, + ): + super(DCNPooling, self).__init__( + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std, + ) + + self.deform_fc_dim = deform_fc_dim + + if not no_trans: + self.offset_mask_fc = nn.Sequential( + nn.Linear( + self.pooled_size * self.pooled_size * self.output_dim, self.deform_fc_dim + ), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.pooled_size * self.pooled_size * 3), + ) + self.offset_mask_fc[4].weight.data.zero_() + self.offset_mask_fc[4].bias.data.zero_() + + def forward(self, input, rois): + offset = input.new() + + if not self.no_trans: + + # do roi_align first + n = rois.shape[0] + roi = dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + True, # no trans + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) + + # build mask and offset + offset_mask = self.offset_mask_fc(roi.view(n, -1)) + offset_mask = offset_mask.view(n, 3, self.pooled_size, self.pooled_size) + o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + + # do pooling with offset and mask + return ( + dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) + * mask + ) + # only roi_align + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/make.sh b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/make.sh new file mode 100755 index 00000000..55f5587e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/make.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +sudo rm *.so +sudo rm -r build/ +sudo python3 setup.py build develop diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/setup.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/setup.py new file mode 100644 index 00000000..ad9814a4 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/setup.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +import glob +import os + +import torch +from setuptools import find_packages, setup +from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension + +requirements = ["torch", "torchvision"] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + os.environ["CC"] = "g++" + sources = main_file + source_cpu + extension = CppExtension + extra_compile_args = {"cxx": []} + define_macros = [] + + + if torch.cuda.is_available() and CUDA_HOME is not None: + 
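        # Descriptive note (editorial): this branch selects the build flavor. When a CUDA
        # toolchain is visible (torch.cuda.is_available() and CUDA_HOME set), the .cu sources
        # are added and compiled as a CUDAExtension with the WITH_CUDA macro and the usual
        # __CUDA_NO_HALF_* / CUDA_HAS_FP16 guards passed to nvcc; otherwise the commented-out
        # NotImplementedError is skipped and the build quietly falls back to the CPU-only
        # CppExtension declared above.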
extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + # raise NotImplementedError('Cuda is not available') + pass + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + ext_modules = [ + extension( + "_ext", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + return ext_modules + + +setup( + name="DCNv2", + version="0.1", + author="charlesshang", + url="https://github.com/charlesshang/DCNv2", + description="deformable convolutional networks", + packages=find_packages(exclude=("configs", "tests")), + # install_requires=requirements, + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp new file mode 100644 index 00000000..76c65f01 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp @@ -0,0 +1,233 @@ +#include +#include "cpu/dcn_v2_im2col_cpu.h" + +#include +//#include + +#include +//#include +//#include + +//extern THCState *state; + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu +// modified from the CUDA version for CPU use by Daniel K. Suhendro + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * 
(kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + using scalar_t = float; + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto output_n = output.select(0, b); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + // (N x 1) (1 x M) + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, + ones.contiguous().data(), k_, + bias.contiguous().data(), k_, 0.0f, + output_n.data(), n_); + + modulated_deformable_im2col_cpu(input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data()); + + //(k * m) x (m * n) + // Y = WC + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THFloatBlas_gemm('n', 'n', n, m, k, 1.0f, + columns.data(), n, + weight.data(), k, 1.0f, + output_n.data(), n); + } + return output; +} + +std::vector dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto 
grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THFloatBlas_gemm('n', 't', n, m, k, 1.0f, + grad_output_n.data(), n, + weight.data(), m, 0.0f, + columns.data(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cpu(columns.data(), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data(), + grad_mask_n.data()); + // gradient w.r.t. input data + modulated_deformable_col2im_cpu(columns.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data()); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cpu(input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, + columns.data(), k_, + grad_output_n.data(), k_, 1.0f, + grad_weight.data(), n_); + + // gradient w.r.t. bias + // long m_ = channels_out; + // long k__ = height_out * width_out; + // THFloatBlas_gemv('t', k_, m_, 1.0f, + // grad_output_n.data(), k_, + // ones.data(), 1, 1.0f, + // grad_bias.data(), 1); + } + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp new file mode 100644 index 00000000..1704a60d --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp @@ -0,0 +1,395 @@ +#include "dcn_v2_im2col_cpu.h" +#include +#include +#include + +#include +//#include + +#include +//#include +//#include + +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +/*#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + + +float dmcn_im2col_bilinear_cpu(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +float dmcn_get_gradient_weight_cpu(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +float dmcn_get_coordinate_weight_cpu(float argmax_h, float argmax_w, + const int height, const int width, const float *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) 
* im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +void modulated_deformable_im2col_cpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) +{ + // launch channels * batch_size * height_col * width_col cores + for(int index=0; index(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } +} + +void modulated_deformable_col2im_cpu_kernel(const int n, const float *data_col, const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + for(int index = 0; index < n; index++) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * 
dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + float weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + //atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad; + + } + } + } + } +} + +void modulated_deformable_col2im_coord_cpu_kernel(const int n, const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + for(int index = 0; index < n; index++) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = 
data_mask_ptr[data_mask_hw_ptr]; + float inv_h = h_in + i * dilation_h + offset_h; + float inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const float weight = dmcn_get_coordinate_weight_cpu( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cpu(const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_cpu_kernel( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + }*/ + +} + +void modulated_deformable_col2im_cpu(const float* data_col, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* grad_im){ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + modulated_deformable_col2im_cpu_kernel( + num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im); + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + }*/ + +} + +void modulated_deformable_col2im_coord_cpu(const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, 
const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + modulated_deformable_col2im_coord_cpu_kernel( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + }*/ +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.h new file mode 100644 index 00000000..bad5c528 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.h @@ -0,0 +1,99 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ +// modified from the CUDA version for CPU use by Daniel K. Suhendro + +#ifndef DCN_V2_IM2COL_CPU +#define DCN_V2_IM2COL_CPU + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cpu(const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cpu(const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cpu(const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp new file mode 100644 index 00000000..6e41aaed --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp @@ -0,0 +1,426 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +#include +#include +#include + +#include +//#include + +#include +//#include +//#include + +/*#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + +template +T bilinear_interp_cpu( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template + void DeformablePSROIPoolForwardKernelCpu( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + for(int index = 0; index < count; index++) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = std::min(std::max(w, T(0.)), width - T(1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp_cpu(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +void DeformablePSROIPoolBackwardAccKernelCpu( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + for(int index = 0; index < count; index++) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = std::min(std::max(w, T(0.)), width - T(1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + /*atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);*/ + *(offset_bottom_data_diff + bottom_index_base + y0 * width + x0) += q00 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y1 * width + x0) += q01 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y0 * width + x1) += q10 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y1 * width + x1) += q11 * diff_val; + + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + /*atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);*/ + *(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w) += diff_x; + *(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w) += diff_y; + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float 
spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + //cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512);*/ + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cpu_forward", [&] { + DeformablePSROIPoolForwardKernelCpu( + out_size, + input.contiguous().data(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data(), + top_count.data()); + }); + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + /*AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream();*/ + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cpu_backward", [&] { + DeformablePSROIPoolBackwardAccKernelCpu( + out_size, + out_grad.contiguous().data(), + top_count.contiguous().data(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data(), + trans_grad.contiguous().data(), + input.contiguous().data(), + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/vision.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/vision.h new file mode 100644 index 00000000..d5fbf1f0 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu new file mode 100644 index 00000000..cef3068f --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu @@ -0,0 +1,372 @@ +#include +#include 
"cuda/dcn_v2_im2col_cuda.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +THCState *state = at::globalContext().lazyInitCUDA(); + +static cublasOperation_t _cublasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return CUBLAS_OP_N; + case 't': + case 'T': + return CUBLAS_OP_T; + case 'c': + case 'C': + return CUBLAS_OP_C; + } + AT_ERROR( + "_cublasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); + } + + static void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { + // Note: leading dimensions generally are checked that they are > 0 + // and at least as big the result requires (even if the value won't + // be used). + + // Q: Why does Level3 check trans but this doesn't? + // A: In level 2, the sizes (m, n) specify the size of A + // (independent of trans value). In level 3. the sizes (m, n, k) + // specify the sizes of op(A), op(B) where op depend on trans + // values. + if (n <= 1) + *lda = std::max(m, 1); + } + + + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +// [batch gemm] +// https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu + +__global__ void createBatchGemmBuffer(const float **input_b, float **output_b, + float **columns_b, const float **ones_b, + const float **weight_b, const float **bias_b, + float *input, float *output, + float *columns, float *ones, + float *weight, float *bias, + const int input_stride, const int output_stride, + const int columns_stride, const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + input_b[idx] = input + idx * input_stride; + output_b[idx] = output + idx * output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + bias_b[idx] = bias; + } +} + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + using scalar_t = float; + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, 
kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({batch, height_out, width_out}, input.options()); + auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + // prepare for batch-wise computing, which is significantly faster than instance-wise computing + // when batch size is large. + // launch batch threads + int matrices_size = batch * sizeof(float *); + auto input_b = static_cast(THCudaMalloc(state, matrices_size)); + auto output_b = static_cast(THCudaMalloc(state, matrices_size)); + auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); + auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); + auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); + auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); + + const int block = 128; + const int grid = (batch + block - 1) / block; + + createBatchGemmBuffer<<>>( + input_b, output_b, + columns_b, ones_b, + weight_b, bias_b, + input.data_ptr(), + output.data_ptr(), + columns.data_ptr(), + ones.data_ptr(), + weight.data_ptr(), + bias.data_ptr(), + channels * width * height, + channels_out * width_out * height_out, + channels * kernel_h * kernel_w * height_out * width_out, + height_out * width_out, + batch); + + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THCudaBlas_SgemmBatched(state, + 't', + 'n', + n_, + m_, + k_, + 1.0f, + ones_b, k_, + bias_b, k_, + 0.0f, + output_b, n_, + batch); + + modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), + input.data_ptr(), + offset.data_ptr(), + mask.data_ptr(), + batch, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data_ptr()); + + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THCudaBlas_SgemmBatched(state, + 'n', + 'n', + n, + m, + k, + 1.0f, + (const float **)columns_b, n, + weight_b, k, + 1.0f, + output_b, n, + batch); + + THCudaFree(state, input_b); + THCudaFree(state, output_b); + THCudaFree(state, columns_b); + THCudaFree(state, ones_b); + THCudaFree(state, weight_b); + THCudaFree(state, bias_b); + return output; +} + +__global__ void createBatchGemmBufferBackward( + float **grad_output_b, + float **columns_b, + float **ones_b, + float **weight_b, + float **grad_weight_b, + float **grad_bias_b, + float *grad_output, + float *columns, + float *ones, + float *weight, + float *grad_weight, + float *grad_bias, + const int grad_output_stride, + const int columns_stride, + const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + grad_output_b[idx] = grad_output + idx * grad_output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + grad_weight_b[idx] = grad_weight; + grad_bias_b[idx] = grad_bias; + } +} + +std::vector dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, 
+ const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, + grad_output_n.data_ptr(), n, + weight.data_ptr(), m, 0.0f, + columns.data_ptr(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda(c10::cuda::getCurrentCUDAStream(), + columns.data_ptr(), + input_n.data_ptr(), + offset_n.data_ptr(), + mask_n.data_ptr(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data_ptr(), + grad_mask_n.data_ptr()); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda(c10::cuda::getCurrentCUDAStream(), + columns.data_ptr(), + offset_n.data_ptr(), + mask_n.data_ptr(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data_ptr()); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), + input_n.data_ptr(), + offset_n.data_ptr(), + mask_n.data_ptr(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data_ptr()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, + columns.data_ptr(), k_, + grad_output_n.data_ptr(), k_, 1.0f, + grad_weight.data_ptr(), n_); + + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + cublasOperation_t op = _cublasOpFromChar('t'); + _cublasAdjustLdLevel2(k_, m_, &k_); + scalar_t* grad_output_n_float = grad_output_n.data_ptr(); + scalar_t* one_float = ones.data_ptr(); + scalar_t alpha = 1.0; + scalar_t beta = 1.0; + cublasSgemv(handle, op, k_, m_, &alpha, grad_output_n_float,k_, one_float,1, &beta, grad_bias.data_ptr(), 1); + + } + + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu new file mode 100644 index 00000000..4140eacf --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu @@ -0,0 +1,402 @@ +#include "dcn_v2_im2col_cuda.h" +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + + +__device__ float dmcn_im2col_bilinear_cuda(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +__device__ float dmcn_get_gradient_weight_cuda(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == 
argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +__device__ float dmcn_get_coordinate_weight_cuda(float argmax_h, float argmax_w, + const int height, const int width, const float *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) +{ + // launch channels * batch_size * height_col * width_col cores + CUDA_KERNEL_LOOP(index, n) + { + // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) + // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis + + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + // const int b_col = (index / width_col / height_col) % batch_size; + const int b_col = (index / width_col / height_col / num_channels) % batch_size; + // const int c_im = (index / width_col / height_col) / batch_size; + const int c_im = (index / width_col / height_col) % num_channels; + // const int c_col = c_im * kernel_h * kernel_w; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + // float 
*data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float val = static_cast(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } +} + +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const float *data_col, const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * 
height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + float weight = dmcn_get_gradient_weight_cuda(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * 
width_col + w_out);
+      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
+      const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
+      const float offset_h = data_offset_ptr[data_offset_h_ptr];
+      const float offset_w = data_offset_ptr[data_offset_w_ptr];
+      const float mask = data_mask_ptr[data_mask_hw_ptr];
+      float inv_h = h_in + i * dilation_h + offset_h;
+      float inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+      {
+        inv_h = inv_w = -2;
+      }
+      else
+      {
+        mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cuda(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
+      }
+      const float weight = dmcn_get_coordinate_weight_cuda(
+          inv_h, inv_w,
+          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
+      val += weight * data_col_ptr[col_pos] * mask;
+      cnt += 1;
+    }
+    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
+    grad_offset[index] = val;
+    if (offset_c % 2 == 0)
+      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
+      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
+  }
+}
+
+void modulated_deformable_im2col_cuda(cudaStream_t stream,
+  const float* data_im, const float* data_offset, const float* data_mask,
+  const int batch_size, const int channels, const int height_im, const int width_im,
+  const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+  const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+  const int dilation_h, const int dilation_w,
+  const int deformable_group, float* data_col) {
+  // num_axes should be smaller than block size
+  const int channel_per_deformable_group = channels / deformable_group;
+  const int num_kernels = channels * batch_size * height_col * width_col;
+  modulated_deformable_im2col_gpu_kernel
+      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
+      num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w,
+      pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
+      batch_size, channels, deformable_group, height_col, width_col, data_col);
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
+
+void modulated_deformable_col2im_cuda(cudaStream_t stream,
+  const float* data_col, const float* data_offset, const float* data_mask,
+  const int batch_size, const int channels, const int height_im, const int width_im,
+  const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+  const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+  const int dilation_h, const int dilation_w,
+  const int deformable_group, float* grad_im){
+
+  const int channel_per_deformable_group = channels / deformable_group;
+  const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
+  modulated_deformable_col2im_gpu_kernel
+      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
+      num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im,
+      kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+      dilation_h, dilation_w, channel_per_deformable_group,
+      batch_size, deformable_group, height_col, width_col, grad_im);
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    
printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + modulated_deformable_col2im_coord_gpu_kernel + <<>>( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.h new file mode 100644 index 00000000..c8568319 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.h @@ -0,0 +1,101 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ + +#ifndef DCN_V2_IM2COL_CUDA +#define DCN_V2_IM2COL_CUDA + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu new file mode 100644 index 00000000..bf99f0ca --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu @@ -0,0 +1,419 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ T bilinear_interp_cuda( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernelCuda( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp_cuda(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernelCuda( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 
2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { + DeformablePSROIPoolForwardKernelCuda<<>>( + out_size, + input.contiguous().data_ptr(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data_ptr(), + trans.contiguous().data_ptr(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data_ptr(), + top_count.data_ptr()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { + DeformablePSROIPoolBackwardAccKernelCuda<<>>( + out_size, + out_grad.contiguous().data_ptr(), + top_count.contiguous().data_ptr(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data_ptr(), + trans_grad.contiguous().data_ptr(), + input.contiguous().data_ptr(), + bbox.contiguous().data_ptr(), + trans.contiguous().data_ptr(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/vision.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/vision.h new file mode 100644 index 00000000..f3672b11 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include +#include +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/dcn_v2.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/dcn_v2.h new file mode 100644 index 00000000..de670bf9 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/dcn_v2.h @@ -0,0 +1,190 @@ +#pragma once + +#include 
"cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +at::Tensor +dcn_v2_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_cpu_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); + } +} + +std::vector +dcn_v2_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_cpu_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); + } +} + +std::tuple +dcn_v2_psroi_pooling_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_psroi_pooling_cpu_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); + } +} + +std::tuple +dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_psroi_pooling_cpu_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); + } +} \ No newline 
at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/vision.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/vision.cpp new file mode 100644 index 00000000..ff54233e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/vision.cpp @@ -0,0 +1,9 @@ + +#include "dcn_v2.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); + m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); + m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); + m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcpu.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcpu.py new file mode 100644 index 00000000..0a0b36eb --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcpu.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from dcn_v2 import dcn_v2_conv, DCNv2, DCN +from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling + +deformable_groups = 1 +N, inC, inH, inW = 2, 2, 4, 4 +outC = 2 +kH, kW = 3, 3 + + +def conv_identify(weight, bias): + weight.data.zero_() + bias.data.zero_() + o, i, h, w = weight.shape + y = h//2 + x = w//2 + for p in range(i): + for q in range(o): + if p == q: + weight.data[q, p, y, x] = 1.0 + + +def check_zero_offset(): + conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True) + + conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True) + + dcn_v2 = DCNv2(inC, outC, (kH, kW), + stride=1, padding=1, dilation=1, + deformable_groups=deformable_groups) + + conv_offset.weight.data.zero_() + conv_offset.bias.data.zero_() + conv_mask.weight.data.zero_() + conv_mask.bias.data.zero_() + conv_identify(dcn_v2.weight, dcn_v2.bias) + + input = torch.randn(N, inC, inH, inW) + offset = conv_offset(input) + mask = conv_mask(input) + mask = torch.sigmoid(mask) + output = dcn_v2(input, offset, mask) + output *= 2 + d = (input - output).abs().max() + if d < 1e-10: + print('Zero offset passed') + else: + print('Zero offset failed') + print(input) + print(output) + +def check_gradient_dconv(): + + input = torch.rand(N, inC, inH, inW) * 0.01 + input.requires_grad = True + + offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW) * 2 + # offset.data.zero_() + # offset.data -= 0.5 + offset.requires_grad = True + + mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW) + # mask.data.zero_() + mask.requires_grad = True + mask = torch.sigmoid(mask) + + weight = torch.randn(outC, inC, kH, kW) + weight.requires_grad = True + + bias = torch.rand(outC) + bias.requires_grad = True + + stride = 1 + padding = 1 + dilation = 1 + + print('check_gradient_dconv: ', + gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups), + eps=1e-3, 
atol=1e-4, rtol=1e-2)) + + +def check_pooling_zero_offset(): + + input = torch.randn(2, 16, 64, 64).zero_() + input[0, :, 16:26, 16:26] = 1. + input[1, :, 10:20, 20:30] = 2. + rois = torch.tensor([ + [0, 65, 65, 103, 103], + [1, 81, 41, 119, 79], + ]).float() + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0) + + out = pooling(input, rois, input.new()) + s = ', '.join(['%f' % out[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0) + offset = torch.randn(20, 2, 7, 7).zero_() + dout = dpooling(input, rois, offset) + s = ', '.join(['%f' % dout[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + +def check_gradient_dpooling(): + input = torch.randn(2, 3, 5, 5) * 0.01 + N = 4 + batch_inds = torch.randint(2, (N, 1)).float() + x = torch.rand((N, 1)).float() * 15 + y = torch.rand((N, 1)).float() * 15 + w = torch.rand((N, 1)).float() * 10 + h = torch.rand((N, 1)).float() * 10 + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(N, 2, 3, 3) + input.requires_grad = True + offset.requires_grad = True + + spatial_scale = 1.0 / 4 + pooled_size = 3 + output_dim = 3 + no_trans = 0 + group_size = 1 + trans_std = 0.0 + sample_per_part = 4 + part_size = pooled_size + + print('check_gradient_dpooling:', + gradcheck(dcn_v2_pooling, (input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std), + eps=1e-4)) + + +def example_dconv(): + input = torch.randn(2, 64, 128, 128) + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, + padding=1, deformable_groups=2) + # print(dcn.weight.shape, input.shape) + output = dcn(input) + targert = output.new(*output.size()) + targert.data.uniform_(-0.01, 0.01) + error = (targert - output).mean() + error.backward() + print(output.shape) + + +def example_dpooling(): + input = torch.randn(2, 32, 64, 64) + batch_inds = torch.randint(2, (20, 1)).float() + x = torch.randint(256, (20, 1)).float() + y = torch.randint(256, (20, 1)).float() + w = torch.randint(64, (20, 1)).float() + h = torch.randint(64, (20, 1)).float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(20, 2, 7, 7) + input.requires_grad = True + offset.requires_grad = True + + # normal roi_align + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1) + + # deformable pooling + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1) + + out = pooling(input, rois, offset) + dout = dpooling(input, rois, offset) + print(out.shape) + print(dout.shape) + + target_out = out.new(*out.size()) + target_out.data.uniform_(-0.01, 0.01) + target_dout = dout.new(*dout.size()) + target_dout.data.uniform_(-0.01, 0.01) + e = (target_out - out).mean() + e.backward() + e = (target_dout - dout).mean() + e.backward() + + +def example_mdpooling(): + input = torch.randn(2, 32, 64, 64) + input.requires_grad = True + batch_inds = torch.randint(2, (20, 1)).float() + x = torch.randint(256, (20, 1)).float() + y = torch.randint(256, (20, 1)).float() + w = torch.randint(64, (20, 1)).float() + h = torch.randint(64, (20, 1)).float() + rois = torch.cat((batch_inds, x, y, x 
+ w, y + h), dim=1) + + # mdformable pooling (V2) + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024) + + dout = dpooling(input, rois) + target = dout.new(*dout.size()) + target.data.uniform_(-0.1, 0.1) + error = (target - dout).mean() + error.backward() + print(dout.shape) + + +if __name__ == '__main__': + + example_dconv() + example_dpooling() + example_mdpooling() + + check_pooling_zero_offset() + # zero offset check + if inC == outC: + check_zero_offset() + + check_gradient_dpooling() + check_gradient_dconv() + # """ + # ****** Note: backward is not reentrant error may not be a serious problem, + # ****** since the max error is less than 1e-7, + # ****** Still looking for what trigger this problem + # """ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcuda.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcuda.py new file mode 100644 index 00000000..3bd5bd22 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcuda.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from dcn_v2 import dcn_v2_conv, DCNv2, DCN +from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling + +deformable_groups = 1 +N, inC, inH, inW = 2, 2, 4, 4 +outC = 2 +kH, kW = 3, 3 + + +def conv_identify(weight, bias): + weight.data.zero_() + bias.data.zero_() + o, i, h, w = weight.shape + y = h//2 + x = w//2 + for p in range(i): + for q in range(o): + if p == q: + weight.data[q, p, y, x] = 1.0 + + +def check_zero_offset(): + conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + dcn_v2 = DCNv2(inC, outC, (kH, kW), + stride=1, padding=1, dilation=1, + deformable_groups=deformable_groups).cuda() + + conv_offset.weight.data.zero_() + conv_offset.bias.data.zero_() + conv_mask.weight.data.zero_() + conv_mask.bias.data.zero_() + conv_identify(dcn_v2.weight, dcn_v2.bias) + + input = torch.randn(N, inC, inH, inW).cuda() + offset = conv_offset(input) + mask = conv_mask(input) + mask = torch.sigmoid(mask) + output = dcn_v2(input, offset, mask) + output *= 2 + d = (input - output).abs().max() + if d < 1e-10: + print('Zero offset passed') + else: + print('Zero offset failed') + print(input) + print(output) + +def check_gradient_dconv(): + + input = torch.rand(N, inC, inH, inW).cuda() * 0.01 + input.requires_grad = True + + offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2 + # offset.data.zero_() + # offset.data -= 0.5 + offset.requires_grad = True + + mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW).cuda() + # mask.data.zero_() + mask.requires_grad = True + mask = torch.sigmoid(mask) + + weight = torch.randn(outC, inC, kH, kW).cuda() + weight.requires_grad = True + + bias = torch.rand(outC).cuda() + bias.requires_grad = True + + stride = 1 + padding = 1 + dilation = 1 + + print('check_gradient_dconv: ', + gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, + stride, padding, 
dilation, deformable_groups), + eps=1e-3, atol=1e-4, rtol=1e-2)) + + +def check_pooling_zero_offset(): + + input = torch.randn(2, 16, 64, 64).cuda().zero_() + input[0, :, 16:26, 16:26] = 1. + input[1, :, 10:20, 20:30] = 2. + rois = torch.tensor([ + [0, 65, 65, 103, 103], + [1, 81, 41, 119, 79], + ]).cuda().float() + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0).cuda() + + out = pooling(input, rois, input.new()) + s = ', '.join(['%f' % out[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0).cuda() + offset = torch.randn(20, 2, 7, 7).cuda().zero_() + dout = dpooling(input, rois, offset) + s = ', '.join(['%f' % dout[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + +def check_gradient_dpooling(): + input = torch.randn(2, 3, 5, 5).cuda() * 0.01 + N = 4 + batch_inds = torch.randint(2, (N, 1)).cuda().float() + x = torch.rand((N, 1)).cuda().float() * 15 + y = torch.rand((N, 1)).cuda().float() * 15 + w = torch.rand((N, 1)).cuda().float() * 10 + h = torch.rand((N, 1)).cuda().float() * 10 + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(N, 2, 3, 3).cuda() + input.requires_grad = True + offset.requires_grad = True + + spatial_scale = 1.0 / 4 + pooled_size = 3 + output_dim = 3 + no_trans = 0 + group_size = 1 + trans_std = 0.0 + sample_per_part = 4 + part_size = pooled_size + + print('check_gradient_dpooling:', + gradcheck(dcn_v2_pooling, (input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std), + eps=1e-4)) + + +def example_dconv(): + input = torch.randn(2, 64, 128, 128).cuda() + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, + padding=1, deformable_groups=2).cuda() + # print(dcn.weight.shape, input.shape) + output = dcn(input) + targert = output.new(*output.size()) + targert.data.uniform_(-0.01, 0.01) + error = (targert - output).mean() + error.backward() + print(output.shape) + + +def example_dpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + batch_inds = torch.randint(2, (20, 1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(20, 2, 7, 7).cuda() + input.requires_grad = True + offset.requires_grad = True + + # normal roi_align + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1).cuda() + + # deformable pooling + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1).cuda() + + out = pooling(input, rois, offset) + dout = dpooling(input, rois, offset) + print(out.shape) + print(dout.shape) + + target_out = out.new(*out.size()) + target_out.data.uniform_(-0.01, 0.01) + target_dout = dout.new(*dout.size()) + target_dout.data.uniform_(-0.01, 0.01) + e = (target_out - out).mean() + e.backward() + e = (target_dout - dout).mean() + e.backward() + + +def example_mdpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + input.requires_grad = True + batch_inds = torch.randint(2, (20, 
1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + + # mdformable pooling (V2) + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024).cuda() + + dout = dpooling(input, rois) + target = dout.new(*dout.size()) + target.data.uniform_(-0.1, 0.1) + error = (target - dout).mean() + error.backward() + print(dout.shape) + + +if __name__ == '__main__': + + example_dconv() + example_dpooling() + example_mdpooling() + + check_pooling_zero_offset() + # zero offset check + if inC == outC: + check_zero_offset() + + check_gradient_dpooling() + check_gradient_dconv() + # """ + # ****** Note: backward is not reentrant error may not be a serious problem, + # ****** since the max error is less than 1e-7, + # ****** Still looking for what trigger this problem + # """ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/LICENSE b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/LICENSE new file mode 100644 index 00000000..b2e3b520 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2019, Charles Shang +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
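The pooling paths in the two test scripts above pass regions of interest as a single (num_rois, 5) tensor whose rows are [batch_index, x1, y1, x2, y2]; the CUDA kernels read five values per ROI (offset_bottom_rois + n * 5). A minimal sketch of assembling such a tensor from per-image boxes (make_rois is an illustrative name, not part of this patch):

# Sketch only: hypothetical helper showing the 5-column ROI layout
# ([batch_index, x1, y1, x2, y2]) consumed by dcn_v2_pooling.
import torch

def make_rois(boxes_per_image):
    # boxes_per_image: list of (N_i, 4) tensors with (x1, y1, x2, y2) boxes.
    rows = []
    for batch_index, boxes in enumerate(boxes_per_image):
        inds = torch.full((boxes.shape[0], 1), float(batch_index))
        rows.append(torch.cat((inds, boxes.float()), dim=1))
    return torch.cat(rows, dim=0)  # shape (sum of N_i, 5)

# Example mirroring check_pooling_zero_offset: two images, one box each.
rois = make_rois([torch.tensor([[65., 65., 103., 103.]]),
                  torch.tensor([[81., 41., 119., 79.]])])
assert rois.shape == (2, 5)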
\ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/build.ninja new file mode 100644 index 00000000..d05f7a42 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/build.ninja @@ -0,0 +1,23 @@ +ninja_required_version = 1.3 +cxx = c++ + +cflags = -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/vision.o: compile 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/vision.cpp + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/dcn_v2.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/dcn_v2.py new file mode 100644 index 00000000..d3f44c08 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/dcn_v2.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import math +import torch +from torch import nn +from torch.autograd import Function +from torch.nn.modules.utils import _pair +from torch.autograd.function import once_differentiable +from torch.onnx.symbolic_helper import parse_args +import _ext as _backend + + +class _DCNv2(Function): + #@parse_args('v', 'v', "v", 'v', 'v', "i", "i", "i", "i", "i") + #def symbolic(g, input, offset, weight, stride=1, padding=0,\ + # dilation=1, groups=1,\ + # deformable_groups=1, im2col_step=64): + # return g.op("DeformConv", input, offset, weight,strides_i=stride, \ + # padding_i = padding, dilation_i=dilation, groups_i=groups, \ + # deformable_group_i=deformable_groups, im2col_step_i = im2col_step, auto_pad_s="None") + + @staticmethod + def forward(ctx, input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups): + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.kernel_size = _pair(weight.shape[2:4]) + ctx.deformable_groups = deformable_groups + output = _backend.dcn_v2_forward(input, weight, bias, + offset, mask, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + ctx.save_for_backward(input, offset, mask, weight, bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \ + _backend.dcn_v2_backward(input, weight, + bias, + offset, mask, + grad_output, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + + return grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\ + None, None, None, None, + + +dcn_v2_conv = _DCNv2.apply + + +class DCNv2(nn.Module): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding, dilation=1, deformable_groups=1): + super(DCNv2, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter(torch.Tensor( + out_channels, in_channels, *self.kernel_size)) + self.bias = nn.Parameter(torch.Tensor(out_channels)) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + self.bias.data.zero_() + + def forward(self, input, offset, mask): + assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + offset.shape[1] + assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + mask.shape[1] + return dcn_v2_conv(input, offset, mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + +class DCN(DCNv2): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding, + dilation=1, deformable_groups=1): + super(DCN, self).__init__(in_channels, out_channels, + kernel_size, stride, padding, dilation, deformable_groups) + + channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] + self.conv_offset_mask = nn.Conv2d(self.in_channels, + channels_, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + #@parse_args('v', 'v', "v", 'v', 'v', "i", "i", "i", "i", "i") + #def symbolic(g, input, offset, weight, stride=1, padding=0, + # dilation=1, groups=1, + # deformable_groups=1, im2col_step=64): + # return g.op("DeformConv", input, offset, weight,strides_i=stride, + # padding_i = padding, dilation_i=dilation, groups_i=groups, + # deformable_group_i=deformable_groups, im2col_step_i = im2col_step, auto_pad_s="None") + + def forward(self, input): + out = self.conv_offset_mask(input) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return dcn_v2_conv(input, offset, mask, + self.weight, self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + + +class _DCNv2Pooling(Function): + @staticmethod + def forward(ctx, input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + ctx.spatial_scale = spatial_scale + ctx.no_trans = int(no_trans) + ctx.output_dim = output_dim + ctx.group_size = group_size + ctx.pooled_size = pooled_size + ctx.part_size = pooled_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + output, output_count = \ + _backend.dcn_v2_psroi_pooling_forward(input, rois, offset, + ctx.no_trans, ctx.spatial_scale, + ctx.output_dim, ctx.group_size, + ctx.pooled_size, ctx.part_size, + ctx.sample_per_part, ctx.trans_std) + ctx.save_for_backward(input, rois, offset, output_count) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, offset, output_count = ctx.saved_tensors + grad_input, grad_offset = \ + _backend.dcn_v2_psroi_pooling_backward(grad_output, + input, + rois, + offset, + output_count, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std) + + return grad_input, None, grad_offset, \ + None, None, None, None, None, None, None, None + + +dcn_v2_pooling = _DCNv2Pooling.apply + + +class DCNv2Pooling(nn.Module): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DCNv2Pooling, self).__init__() + self.spatial_scale = spatial_scale + self.pooled_size = pooled_size + self.output_dim = output_dim + 
self.no_trans = no_trans + self.group_size = group_size + self.part_size = pooled_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, input, rois, offset): + assert input.shape[1] == self.output_dim + if self.no_trans: + offset = input.new() + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + +class DCNPooling(DCNv2Pooling): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + deform_fc_dim=1024): + super(DCNPooling, self).__init__(spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std) + + self.deform_fc_dim = deform_fc_dim + + if not no_trans: + self.offset_mask_fc = nn.Sequential( + nn.Linear(self.pooled_size * self.pooled_size * + self.output_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.pooled_size * + self.pooled_size * 3) + ) + self.offset_mask_fc[4].weight.data.zero_() + self.offset_mask_fc[4].bias.data.zero_() + + def forward(self, input, rois): + offset = input.new() + + if not self.no_trans: + + # do roi_align first + n = rois.shape[0] + roi = dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + True, # no trans + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + # build mask and offset + offset_mask = self.offset_mask_fc(roi.view(n, -1)) + offset_mask = offset_mask.view( + n, 3, self.pooled_size, self.pooled_size) + o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + + # do pooling with offset and mask + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) * mask + # only roi_align + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/make.sh b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/make.sh new file mode 100755 index 00000000..f1f15c0e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/make.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +python setup.py build develop diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/setup.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/setup.py new file mode 100644 index 00000000..0e0dc964 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/setup.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +import os +import glob + +import torch + +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +from setuptools 
import find_packages +from setuptools import setup + +requirements = ["torch", "torchvision"] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + os.environ["CC"] = "g++" + sources = main_file + source_cpu + extension = CppExtension + extra_compile_args = {"cxx": []} + define_macros = [] + + + if torch.cuda.is_available() and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + #raise NotImplementedError('Cuda is not available') + pass + + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + ext_modules = [ + extension( + "_ext", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + return ext_modules + +setup( + name="DCNv2", + version="0.1", + author="charlesshang", + url="https://github.com/charlesshang/DCNv2", + description="deformable convolutional networks", + packages=find_packages(exclude=("configs", "tests",)), + # install_requires=requirements, + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.cpp new file mode 100644 index 00000000..a94273e2 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.cpp @@ -0,0 +1,233 @@ +#include +#include "cpu/dcn_v2_im2col_cpu.h" + +#include +//#include + +#include +//#include +//#include + +//extern THCState *state; + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + using scalar_t = float; + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto output_n = output.select(0, b); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + // (N x 1) (1 x M) + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, + ones.contiguous().data(), k_, + bias.contiguous().data(), k_, 0.0f, + output_n.data(), n_); + + modulated_deformable_im2col_cpu(input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data()); + + //(k * m) x (m * n) + // Y = WC + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THFloatBlas_gemm('n', 'n', n, m, k, 1.0f, + columns.data(), n, + weight.data(), k, 1.0f, + output_n.data(), n); + } + return output; +} + +std::vector dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int 
dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THFloatBlas_gemm('n', 't', n, m, k, 1.0f, + grad_output_n.data(), n, + weight.data(), m, 0.0f, + columns.data(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cpu(columns.data(), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data(), + grad_mask_n.data()); + // gradient w.r.t. input data + modulated_deformable_col2im_cpu(columns.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data()); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cpu(input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, + columns.data(), k_, + grad_output_n.data(), k_, 1.0f, + grad_weight.data(), n_); + + // gradient w.r.t. bias + // long m_ = channels_out; + // long k__ = height_out * width_out; + THFloatBlas_gemv('t', k_, m_, 1.0f, + grad_output_n.data(), k_, + ones.data(), 1, 1.0f, + grad_bias.data(), 1); + } + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.cpp new file mode 100644 index 00000000..1704a60d --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.cpp @@ -0,0 +1,395 @@ +#include "dcn_v2_im2col_cpu.h" +#include +#include +#include + +#include +//#include + +#include +//#include +//#include + +// modified from the CUDA version for CPU use by Daniel K. Suhendro + +/*#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + + +float dmcn_im2col_bilinear_cpu(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +float dmcn_get_gradient_weight_cpu(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - 
w);
+  return weight;
+}
+
+float dmcn_get_coordinate_weight_cpu(float argmax_h, float argmax_w,
+                                     const int height, const int width, const float *im_data,
+                                     const int data_width, const int bp_dir)
+{
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+  {
+    //empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  float weight = 0;
+
+  if (bp_dir == 0)
+  {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+  else if (bp_dir == 1)
+  {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+
+  return weight;
+}
+
+void modulated_deformable_im2col_cpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask,
+                                            const int height, const int width, const int kernel_h, const int kernel_w,
+                                            const int pad_h, const int pad_w,
+                                            const int stride_h, const int stride_w,
+                                            const int dilation_h, const int dilation_w,
+                                            const int channel_per_deformable_group,
+                                            const int batch_size, const int num_channels, const int deformable_group,
+                                            const int height_col, const int width_col,
+                                            float *data_col)
+{
+  // launch channels * batch_size * height_col * width_col cores
+  // (loop body reconstructed from the matching CUDA kernel later in this patch,
+  //  modulated_deformable_im2col_gpu_kernel; only the CUDA_KERNEL_LOOP is replaced by a plain for loop)
+  for (int index = 0; index < n; index++)
+  {
+    // index index of output matrix
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col / num_channels) % batch_size;
+    const int c_im = (index / width_col / height_col) % num_channels;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+
+    float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col;
+    const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
+    const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
+    const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i)
+    {
+      for (int j = 0; j < kernel_w; ++j)
+      {
+        const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
+        const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+        const float offset_h = data_offset_ptr[data_offset_h_ptr];
+        const float offset_w = data_offset_ptr[data_offset_w_ptr];
+        const float mask = data_mask_ptr[data_mask_hw_ptr];
+        float val = static_cast<float>(0);
+        const float h_im = h_in + i * dilation_h + offset_h;
+        const float w_im = w_in + j * dilation_w + offset_w;
+        //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+        {
+          //const float map_h = i * dilation_h + offset_h;
+          //const float map_w = j * dilation_w + offset_w;
+          //const int cur_height = height - h_in;
+          //const int cur_width = width - w_in;
+          //val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
+          val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val * mask;
+        // data_col_ptr += batch_size * height_col * width_col;
+        data_col_ptr += height_col * width_col;
+      }
+    }
+  }
+}
+
+void modulated_deformable_col2im_cpu_kernel(const int n, const float *data_col, const float *data_offset, const float *data_mask,
+                                            const int channels, const int height, const int width,
+                                            const int kernel_h, const int kernel_w,
+                                            const int pad_h, const int pad_w,
+                                            const int stride_h, const int stride_w,
+                                            const int dilation_h, const int dilation_w,
+                                            const int channel_per_deformable_group,
+ 
const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + for(int index = 0; index < n; index++) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + float weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + //atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad; + + } + } + } + } +} + +void modulated_deformable_col2im_coord_cpu_kernel(const int n, const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + for(int index = 0; index < n; index++) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * 
height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float inv_h = h_in + i * dilation_h + offset_h; + float inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const float weight = dmcn_get_coordinate_weight_cpu( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cpu(const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_cpu_kernel( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + 
printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + }*/ + +} + +void modulated_deformable_col2im_cpu(const float* data_col, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* grad_im){ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + modulated_deformable_col2im_cpu_kernel( + num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im); + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + }*/ + +} + +void modulated_deformable_col2im_coord_cpu(const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + modulated_deformable_col2im_coord_cpu_kernel( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + }*/ +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.h new file mode 100644 index 00000000..bad5c528 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.h @@ -0,0 +1,99 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. 
If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +#ifndef DCN_V2_IM2COL_CPU +#define DCN_V2_IM2COL_CPU + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cpu(const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cpu(const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cpu(const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.cpp new file mode 100644 index 00000000..6e41aaed --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.cpp @@ -0,0 +1,426 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +#include +#include +#include + +#include +//#include + +#include +//#include +//#include + +/*#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + +template +T bilinear_interp_cpu( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template + void DeformablePSROIPoolForwardKernelCpu( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + for(int index = 0; index < count; index++) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = std::min(std::max(w, T(0.)), width - T(1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp_cpu(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +void DeformablePSROIPoolBackwardAccKernelCpu( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + for(int index = 0; index < count; index++) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = std::min(std::max(w, T(0.)), width - T(1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + /*atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);*/ + *(offset_bottom_data_diff + bottom_index_base + y0 * width + x0) += q00 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y1 * width + x0) += q01 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y0 * width + x1) += q10 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y1 * width + x1) += q11 * diff_val; + + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + /*atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);*/ + *(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w) += diff_x; + *(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w) += diff_y; + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float 
spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + //cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512);*/ + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cpu_forward", [&] { + DeformablePSROIPoolForwardKernelCpu( + out_size, + input.contiguous().data(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data(), + top_count.data()); + }); + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + /*AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream();*/ + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cpu_backward", [&] { + DeformablePSROIPoolBackwardAccKernelCpu( + out_size, + out_grad.contiguous().data(), + top_count.contiguous().data(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data(), + trans_grad.contiguous().data(), + input.contiguous().data(), + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/vision.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/vision.h new file mode 100644 index 00000000..d5fbf1f0 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_cuda.cu new file mode 100644 index 00000000..9711a497 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_cuda.cu @@ -0,0 +1,335 @@ +#include 
+#include "cuda/dcn_v2_im2col_cuda.h" + +#include +#include + +#include +#include +#include + +THCState *state = at::globalContext().lazyInitCUDA(); + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +// [batch gemm] +// https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu + +__global__ void createBatchGemmBuffer(const float **input_b, float **output_b, + float **columns_b, const float **ones_b, + const float **weight_b, const float **bias_b, + float *input, float *output, + float *columns, float *ones, + float *weight, float *bias, + const int input_stride, const int output_stride, + const int columns_stride, const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + input_b[idx] = input + idx * input_stride; + output_b[idx] = output + idx * output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + bias_b[idx] = bias; + } +} + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + using scalar_t = float; + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({batch, height_out, width_out}, input.options()); + auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + // prepare for batch-wise computing, which is significantly faster than instance-wise computing + // when batch size is large. 
+ // launch batch threads + int matrices_size = batch * sizeof(float *); + auto input_b = static_cast(THCudaMalloc(state, matrices_size)); + auto output_b = static_cast(THCudaMalloc(state, matrices_size)); + auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); + auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); + auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); + auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); + + const int block = 128; + const int grid = (batch + block - 1) / block; + + createBatchGemmBuffer<<>>( + input_b, output_b, + columns_b, ones_b, + weight_b, bias_b, + input.data(), + output.data(), + columns.data(), + ones.data(), + weight.data(), + bias.data(), + channels * width * height, + channels_out * width_out * height_out, + channels * kernel_h * kernel_w * height_out * width_out, + height_out * width_out, + batch); + + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THCudaBlas_SgemmBatched(state, + 't', + 'n', + n_, + m_, + k_, + 1.0f, + ones_b, k_, + bias_b, k_, + 0.0f, + output_b, n_, + batch); + + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input.data(), + offset.data(), + mask.data(), + batch, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data()); + + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THCudaBlas_SgemmBatched(state, + 'n', + 'n', + n, + m, + k, + 1.0f, + (const float **)columns_b, n, + weight_b, k, + 1.0f, + output_b, n, + batch); + + THCudaFree(state, input_b); + THCudaFree(state, output_b); + THCudaFree(state, columns_b); + THCudaFree(state, ones_b); + THCudaFree(state, weight_b); + THCudaFree(state, bias_b); + return output; +} + +__global__ void createBatchGemmBufferBackward( + float **grad_output_b, + float **columns_b, + float **ones_b, + float **weight_b, + float **grad_weight_b, + float **grad_bias_b, + float *grad_output, + float *columns, + float *ones, + float *weight, + float *grad_weight, + float *grad_bias, + const int grad_output_stride, + const int columns_stride, + const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + grad_output_b[idx] = grad_output + idx * grad_output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + grad_weight_b[idx] = grad_weight; + grad_bias_b[idx] = grad_bias; + } +} + +std::vector dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = 
input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, + grad_output_n.data(), n, + weight.data(), m, 0.0f, + columns.data(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state), + columns.data(), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data(), + grad_mask_n.data()); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda(THCState_getCurrentStream(state), + columns.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data()); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, + columns.data(), k_, + grad_output_n.data(), k_, 1.0f, + grad_weight.data(), n_); + + // gradient w.r.t. 
bias + // long m_ = channels_out; + // long k__ = height_out * width_out; + THCudaBlas_Sgemv(state, + 't', + k_, m_, 1.0f, + grad_output_n.data(), k_, + ones.data(), 1, 1.0f, + grad_bias.data(), 1); + } + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.cu new file mode 100644 index 00000000..4140eacf --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.cu @@ -0,0 +1,402 @@ +#include "dcn_v2_im2col_cuda.h" +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + + +__device__ float dmcn_im2col_bilinear_cuda(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +__device__ float dmcn_get_gradient_weight_cuda(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +__device__ float dmcn_get_coordinate_weight_cuda(float argmax_h, float argmax_w, + const int height, const int width, const float *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * 
im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) +{ + // launch channels * batch_size * height_col * width_col cores + CUDA_KERNEL_LOOP(index, n) + { + // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) + // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis + + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + // const int b_col = (index / width_col / height_col) % batch_size; + const int b_col = (index / width_col / height_col / num_channels) % batch_size; + // const int c_im = (index / width_col / height_col) / batch_size; + const int c_im = (index / width_col / height_col) % num_channels; + // const int c_col = c_im * kernel_h * kernel_w; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int 
data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float val = static_cast(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } +} + +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const float *data_col, const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + 
dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + float weight = dmcn_get_gradient_weight_cuda(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float inv_h = h_in + i * dilation_h + offset_h; + float inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cuda(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const float weight = dmcn_get_coordinate_weight_cuda( + inv_h, inv_w, 
+ height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float* data_col, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* grad_im){ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + modulated_deformable_col2im_gpu_kernel + <<>>( + num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + 
modulated_deformable_col2im_coord_gpu_kernel + <<>>( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.h new file mode 100644 index 00000000..c8568319 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.h @@ -0,0 +1,101 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. 
+ * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ + +#ifndef DCN_V2_IM2COL_CUDA +#define DCN_V2_IM2COL_CUDA + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_psroi_pooling_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_psroi_pooling_cuda.cu new file mode 100644 index 00000000..8f08f6a3 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_psroi_pooling_cuda.cu @@ -0,0 +1,419 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ T bilinear_interp_cuda( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernelCuda( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp_cuda(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernelCuda( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 
2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { + DeformablePSROIPoolForwardKernelCuda<<>>( + out_size, + input.contiguous().data(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data(), + top_count.data()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { + DeformablePSROIPoolBackwardAccKernelCuda<<>>( + out_size, + out_grad.contiguous().data(), + top_count.contiguous().data(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data(), + trans_grad.contiguous().data(), + input.contiguous().data(), + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/vision.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/vision.h new file mode 100644 index 00000000..e42a2a79 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/dcn_v2.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/dcn_v2.h new file mode 100644 index 00000000..de670bf9 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/dcn_v2.h @@ -0,0 +1,190 @@ +#pragma once + +#include "cpu/vision.h" 
+ +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +at::Tensor +dcn_v2_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_cpu_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); + } +} + +std::vector +dcn_v2_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_cpu_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); + } +} + +std::tuple +dcn_v2_psroi_pooling_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_psroi_pooling_cpu_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); + } +} + +std::tuple +dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_psroi_pooling_cpu_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); + } +} \ No newline at end of file 
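For orientation: dcn_v2.h above routes each operator to the CUDA build (guarded by WITH_CUDA) or the CPU fallback depending on input.type().is_cuda(), and the vision.cpp added next exposes the four entry points through pybind11. The following is a minimal usage sketch, not taken from this patch: it assumes the extension is importable as `_ext` (the real module name comes from TORCH_EXTENSION_NAME in the build script, which is not shown here), and the tensor shapes follow the conventions used by the bundled test scripts.

# Hypothetical sketch -- the module name `_ext` and the chosen sizes are assumptions.
import torch
import _ext  # compiled from vision.cpp via torch.utils.cpp_extension

N, C_in, C_out, H, W = 2, 64, 64, 128, 128
kH = kW = 3
deformable_groups = 2

inp    = torch.randn(N, C_in, H, W).cuda()
weight = torch.randn(C_out, C_in, kH, kW).cuda()
bias   = torch.zeros(C_out).cuda()
# With stride=1, pad=1, dilation=1 the output keeps the input spatial size, so the
# offsets (2 values per sampling point) and the modulation mask live on an (H, W) grid.
offset = torch.zeros(N, deformable_groups * 2 * kH * kW, H, W).cuda()
mask   = torch.ones(N, deformable_groups * kH * kW, H, W).cuda()

out = _ext.dcn_v2_forward(inp, weight, bias, offset, mask,
                          kH, kW,   # kernel_h, kernel_w
                          1, 1,     # stride_h, stride_w
                          1, 1,     # pad_h, pad_w
                          1, 1,     # dilation_h, dilation_w
                          deformable_groups)
print(out.shape)  # (2, 64, 128, 128)

With zero offsets and an all-ones mask the deformable sampling grid coincides with the regular 3x3 grid, so the call behaves like a plain convolution; this is essentially the property that check_zero_offset in the test scripts below verifies.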
diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/vision.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/vision.cpp new file mode 100644 index 00000000..ff54233e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/vision.cpp @@ -0,0 +1,9 @@ + +#include "dcn_v2.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); + m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); + m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); + m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcpu.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcpu.py new file mode 100644 index 00000000..0a0b36eb --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcpu.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from dcn_v2 import dcn_v2_conv, DCNv2, DCN +from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling + +deformable_groups = 1 +N, inC, inH, inW = 2, 2, 4, 4 +outC = 2 +kH, kW = 3, 3 + + +def conv_identify(weight, bias): + weight.data.zero_() + bias.data.zero_() + o, i, h, w = weight.shape + y = h//2 + x = w//2 + for p in range(i): + for q in range(o): + if p == q: + weight.data[q, p, y, x] = 1.0 + + +def check_zero_offset(): + conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True) + + conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True) + + dcn_v2 = DCNv2(inC, outC, (kH, kW), + stride=1, padding=1, dilation=1, + deformable_groups=deformable_groups) + + conv_offset.weight.data.zero_() + conv_offset.bias.data.zero_() + conv_mask.weight.data.zero_() + conv_mask.bias.data.zero_() + conv_identify(dcn_v2.weight, dcn_v2.bias) + + input = torch.randn(N, inC, inH, inW) + offset = conv_offset(input) + mask = conv_mask(input) + mask = torch.sigmoid(mask) + output = dcn_v2(input, offset, mask) + output *= 2 + d = (input - output).abs().max() + if d < 1e-10: + print('Zero offset passed') + else: + print('Zero offset failed') + print(input) + print(output) + +def check_gradient_dconv(): + + input = torch.rand(N, inC, inH, inW) * 0.01 + input.requires_grad = True + + offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW) * 2 + # offset.data.zero_() + # offset.data -= 0.5 + offset.requires_grad = True + + mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW) + # mask.data.zero_() + mask.requires_grad = True + mask = torch.sigmoid(mask) + + weight = torch.randn(outC, inC, kH, kW) + weight.requires_grad = True + + bias = torch.rand(outC) + bias.requires_grad = True + + stride = 1 + padding = 1 + dilation = 1 + + print('check_gradient_dconv: ', + gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups), + 
eps=1e-3, atol=1e-4, rtol=1e-2)) + + +def check_pooling_zero_offset(): + + input = torch.randn(2, 16, 64, 64).zero_() + input[0, :, 16:26, 16:26] = 1. + input[1, :, 10:20, 20:30] = 2. + rois = torch.tensor([ + [0, 65, 65, 103, 103], + [1, 81, 41, 119, 79], + ]).float() + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0) + + out = pooling(input, rois, input.new()) + s = ', '.join(['%f' % out[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0) + offset = torch.randn(20, 2, 7, 7).zero_() + dout = dpooling(input, rois, offset) + s = ', '.join(['%f' % dout[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + +def check_gradient_dpooling(): + input = torch.randn(2, 3, 5, 5) * 0.01 + N = 4 + batch_inds = torch.randint(2, (N, 1)).float() + x = torch.rand((N, 1)).float() * 15 + y = torch.rand((N, 1)).float() * 15 + w = torch.rand((N, 1)).float() * 10 + h = torch.rand((N, 1)).float() * 10 + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(N, 2, 3, 3) + input.requires_grad = True + offset.requires_grad = True + + spatial_scale = 1.0 / 4 + pooled_size = 3 + output_dim = 3 + no_trans = 0 + group_size = 1 + trans_std = 0.0 + sample_per_part = 4 + part_size = pooled_size + + print('check_gradient_dpooling:', + gradcheck(dcn_v2_pooling, (input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std), + eps=1e-4)) + + +def example_dconv(): + input = torch.randn(2, 64, 128, 128) + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, + padding=1, deformable_groups=2) + # print(dcn.weight.shape, input.shape) + output = dcn(input) + targert = output.new(*output.size()) + targert.data.uniform_(-0.01, 0.01) + error = (targert - output).mean() + error.backward() + print(output.shape) + + +def example_dpooling(): + input = torch.randn(2, 32, 64, 64) + batch_inds = torch.randint(2, (20, 1)).float() + x = torch.randint(256, (20, 1)).float() + y = torch.randint(256, (20, 1)).float() + w = torch.randint(64, (20, 1)).float() + h = torch.randint(64, (20, 1)).float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(20, 2, 7, 7) + input.requires_grad = True + offset.requires_grad = True + + # normal roi_align + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1) + + # deformable pooling + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1) + + out = pooling(input, rois, offset) + dout = dpooling(input, rois, offset) + print(out.shape) + print(dout.shape) + + target_out = out.new(*out.size()) + target_out.data.uniform_(-0.01, 0.01) + target_dout = dout.new(*dout.size()) + target_dout.data.uniform_(-0.01, 0.01) + e = (target_out - out).mean() + e.backward() + e = (target_dout - dout).mean() + e.backward() + + +def example_mdpooling(): + input = torch.randn(2, 32, 64, 64) + input.requires_grad = True + batch_inds = torch.randint(2, (20, 1)).float() + x = torch.randint(256, (20, 1)).float() + y = torch.randint(256, (20, 1)).float() + w = torch.randint(64, (20, 1)).float() + h = torch.randint(64, (20, 1)).float() + rois = 
torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + + # mdformable pooling (V2) + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024) + + dout = dpooling(input, rois) + target = dout.new(*dout.size()) + target.data.uniform_(-0.1, 0.1) + error = (target - dout).mean() + error.backward() + print(dout.shape) + + +if __name__ == '__main__': + + example_dconv() + example_dpooling() + example_mdpooling() + + check_pooling_zero_offset() + # zero offset check + if inC == outC: + check_zero_offset() + + check_gradient_dpooling() + check_gradient_dconv() + # """ + # ****** Note: backward is not reentrant error may not be a serious problem, + # ****** since the max error is less than 1e-7, + # ****** Still looking for what trigger this problem + # """ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcuda.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcuda.py new file mode 100644 index 00000000..3bd5bd22 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcuda.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from dcn_v2 import dcn_v2_conv, DCNv2, DCN +from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling + +deformable_groups = 1 +N, inC, inH, inW = 2, 2, 4, 4 +outC = 2 +kH, kW = 3, 3 + + +def conv_identify(weight, bias): + weight.data.zero_() + bias.data.zero_() + o, i, h, w = weight.shape + y = h//2 + x = w//2 + for p in range(i): + for q in range(o): + if p == q: + weight.data[q, p, y, x] = 1.0 + + +def check_zero_offset(): + conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + dcn_v2 = DCNv2(inC, outC, (kH, kW), + stride=1, padding=1, dilation=1, + deformable_groups=deformable_groups).cuda() + + conv_offset.weight.data.zero_() + conv_offset.bias.data.zero_() + conv_mask.weight.data.zero_() + conv_mask.bias.data.zero_() + conv_identify(dcn_v2.weight, dcn_v2.bias) + + input = torch.randn(N, inC, inH, inW).cuda() + offset = conv_offset(input) + mask = conv_mask(input) + mask = torch.sigmoid(mask) + output = dcn_v2(input, offset, mask) + output *= 2 + d = (input - output).abs().max() + if d < 1e-10: + print('Zero offset passed') + else: + print('Zero offset failed') + print(input) + print(output) + +def check_gradient_dconv(): + + input = torch.rand(N, inC, inH, inW).cuda() * 0.01 + input.requires_grad = True + + offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2 + # offset.data.zero_() + # offset.data -= 0.5 + offset.requires_grad = True + + mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW).cuda() + # mask.data.zero_() + mask.requires_grad = True + mask = torch.sigmoid(mask) + + weight = torch.randn(outC, inC, kH, kW).cuda() + weight.requires_grad = True + + bias = torch.rand(outC).cuda() + bias.requires_grad = True + + stride = 1 + padding = 1 + dilation = 1 + + print('check_gradient_dconv: ', + gradcheck(dcn_v2_conv, (input, offset, 
mask, weight, bias, + stride, padding, dilation, deformable_groups), + eps=1e-3, atol=1e-4, rtol=1e-2)) + + +def check_pooling_zero_offset(): + + input = torch.randn(2, 16, 64, 64).cuda().zero_() + input[0, :, 16:26, 16:26] = 1. + input[1, :, 10:20, 20:30] = 2. + rois = torch.tensor([ + [0, 65, 65, 103, 103], + [1, 81, 41, 119, 79], + ]).cuda().float() + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0).cuda() + + out = pooling(input, rois, input.new()) + s = ', '.join(['%f' % out[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0).cuda() + offset = torch.randn(20, 2, 7, 7).cuda().zero_() + dout = dpooling(input, rois, offset) + s = ', '.join(['%f' % dout[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + +def check_gradient_dpooling(): + input = torch.randn(2, 3, 5, 5).cuda() * 0.01 + N = 4 + batch_inds = torch.randint(2, (N, 1)).cuda().float() + x = torch.rand((N, 1)).cuda().float() * 15 + y = torch.rand((N, 1)).cuda().float() * 15 + w = torch.rand((N, 1)).cuda().float() * 10 + h = torch.rand((N, 1)).cuda().float() * 10 + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(N, 2, 3, 3).cuda() + input.requires_grad = True + offset.requires_grad = True + + spatial_scale = 1.0 / 4 + pooled_size = 3 + output_dim = 3 + no_trans = 0 + group_size = 1 + trans_std = 0.0 + sample_per_part = 4 + part_size = pooled_size + + print('check_gradient_dpooling:', + gradcheck(dcn_v2_pooling, (input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std), + eps=1e-4)) + + +def example_dconv(): + input = torch.randn(2, 64, 128, 128).cuda() + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, + padding=1, deformable_groups=2).cuda() + # print(dcn.weight.shape, input.shape) + output = dcn(input) + targert = output.new(*output.size()) + targert.data.uniform_(-0.01, 0.01) + error = (targert - output).mean() + error.backward() + print(output.shape) + + +def example_dpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + batch_inds = torch.randint(2, (20, 1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(20, 2, 7, 7).cuda() + input.requires_grad = True + offset.requires_grad = True + + # normal roi_align + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1).cuda() + + # deformable pooling + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1).cuda() + + out = pooling(input, rois, offset) + dout = dpooling(input, rois, offset) + print(out.shape) + print(dout.shape) + + target_out = out.new(*out.size()) + target_out.data.uniform_(-0.01, 0.01) + target_dout = dout.new(*dout.size()) + target_dout.data.uniform_(-0.01, 0.01) + e = (target_out - out).mean() + e.backward() + e = (target_dout - dout).mean() + e.backward() + + +def example_mdpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + input.requires_grad = True + 
batch_inds = torch.randint(2, (20, 1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + + # mdformable pooling (V2) + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024).cuda() + + dout = dpooling(input, rois) + target = dout.new(*dout.size()) + target.data.uniform_(-0.1, 0.1) + error = (target - dout).mean() + error.backward() + print(dout.shape) + + +if __name__ == '__main__': + + example_dconv() + example_dpooling() + example_mdpooling() + + check_pooling_zero_offset() + # zero offset check + if inC == outC: + check_zero_offset() + + check_gradient_dpooling() + check_gradient_dconv() + # """ + # ****** Note: backward is not reentrant error may not be a serious problem, + # ****** since the max error is less than 1e-7, + # ****** Still looking for what trigger this problem + # """ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/__init__.py new file mode 100644 index 00000000..165e6372 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/__init__.py @@ -0,0 +1,13 @@ +from .functions.deform_conv import deform_conv, modulated_deform_conv +from .functions.deform_pool import deform_roi_pooling +from .modules.deform_conv import (DeformConv, ModulatedDeformConv, + DeformConvPack, ModulatedDeformConvPack) +from .modules.deform_pool import (DeformRoIPooling, DeformRoIPoolingPack, + ModulatedDeformRoIPoolingPack) + +__all__ = [ + 'DeformConv', 'DeformConvPack', 'ModulatedDeformConv', + 'ModulatedDeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', + 'ModulatedDeformRoIPoolingPack', 'deform_conv', 'modulated_deform_conv', + 'deform_roi_pooling' +] diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so new file mode 100755 index 00000000..a77a9e75 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so new file mode 100755 index 00000000..2e336ed9 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda.o 
b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda.o new file mode 100644 index 00000000..aab12cbd Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda_kernel.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda_kernel.o new file mode 100644 index 00000000..1d23b025 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda_kernel.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda.o new file mode 100644 index 00000000..517d3e89 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda_kernel.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda_kernel.o new file mode 100644 index 00000000..591c2ad2 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda_kernel.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/build.ninja new file mode 100644 index 00000000..b68dcc8c --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/build.ninja @@ -0,0 +1,29 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda/bin/nvcc + +cflags = -pthread -B /ssd8/exec/huangjy/miniconda3/envs/vgt/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /ssd8/exec/huangjy/miniconda3/envs/vgt/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include -fPIC -O2 -isystem /ssd8/exec/huangjy/miniconda3/envs/vgt/include -fPIC -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include/python3.9 -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 
-std=c++14 +cuda_cflags = -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include/python3.9 -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + depfile = $out.d + deps = gcc + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/src/deform_conv_cuda.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/src/deform_conv_cuda.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/src/deform_conv_cuda_kernel.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/.ninja_log b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/.ninja_log new file mode 100644 index 00000000..53358401 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/.ninja_log @@ -0,0 +1,2 @@ +# ninja log v5 +0 7662 1710746646902240006 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o 1473fba0a93f3aa9 diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/build.ninja new file mode 100644 index 00000000..08980c4b --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/build.ninja @@ -0,0 +1,27 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda/bin/nvcc + +cflags = -pthread -B /ssd8/exec/huangjy/miniconda3/envs/lore/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include 
-I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 +cuda_cflags = -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_75,code=sm_75 -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/src/deform_conv_cuda.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o new file mode 100644 index 00000000..bb82822a Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so new file mode 100755 index 00000000..a77a9e75 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so new file mode 100755 index 00000000..2e336ed9 Binary files /dev/null and 
b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_conv.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_conv.py new file mode 100644 index 00000000..e82b0746 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_conv.py @@ -0,0 +1,187 @@ +import torch +from torch.autograd import Function +from torch.nn.modules.utils import _pair + +from .. import deform_conv_cuda + + +class DeformConvFunction(Function): + + @staticmethod + def forward(ctx, + input, + offset, + weight, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + im2col_step=64): + if input is not None and input.dim() != 4: + raise ValueError( + "Expected 4D tensor as input, got {}D tensor instead.".format( + input.dim())) + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.im2col_step = im2col_step + + ctx.save_for_backward(input, offset, weight) + + output = input.new_empty( + DeformConvFunction._output_size(input, weight, ctx.padding, + ctx.dilation, ctx.stride)) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + if not input.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + deform_conv_cuda.deform_conv_forward_cuda( + input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], + weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, + cur_im2col_step) + return output + + @staticmethod + def backward(ctx, grad_output): + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + if not grad_output.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + deform_conv_cuda.deform_conv_backward_input_cuda( + input, offset, grad_output, grad_input, + grad_offset, weight, ctx.bufs_[0], weight.size(3), + weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, + cur_im2col_step) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + deform_conv_cuda.deform_conv_backward_parameters_cuda( + input, offset, grad_output, + grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), + weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, + cur_im2col_step) + + return (grad_input, 
grad_offset, grad_weight, None, None, None, None, + None) + + @staticmethod + def _output_size(input, weight, padding, dilation, stride): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = padding[d] + kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + "convolution input is too small (output would be {})".format( + 'x'.join(map(str, output_size)))) + return output_size + + +class ModulatedDeformConvFunction(Function): + + @staticmethod + def symbolic(g, input, offset, mask, weight, bias,stride,padding,dilation,groups,deformable_groups): + return g.op("DCNv2", input, offset, mask, weight, bias, + stride_i = stride,padding_i = padding,dilation_i = dilation, + groups_i = groups,deformable_group_i = deformable_groups) + + @staticmethod + def forward(ctx, + input, + offset, + mask, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1): + ctx.stride = stride + ctx.padding = padding + ctx.dilation = dilation + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.with_bias = bias is not None + if not ctx.with_bias: + bias = input.new_empty(1) # fake tensor + if not input.is_cuda: + raise NotImplementedError + if weight.requires_grad or mask.requires_grad or offset.requires_grad \ + or input.requires_grad: + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty( + ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + deform_conv_cuda.modulated_deform_conv_cuda_forward( + input, weight, bias, ctx._bufs[0], offset, mask, output, + ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + return output + + @staticmethod + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + deform_conv_cuda.modulated_deform_conv_cuda_backward( + input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], + grad_input, grad_weight, grad_bias, grad_offset, grad_mask, + grad_output, weight.shape[2], weight.shape[3], ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + if not ctx.with_bias: + grad_bias = None + + return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, + None, None, None, None, None) + + @staticmethod + def _infer_shape(ctx, input, weight): + n = input.size(0) + channels_out = weight.size(0) + height, width = input.shape[2:4] + kernel_h, kernel_w = weight.shape[2:4] + height_out = (height + 2 * ctx.padding - + (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 + width_out = (width + 2 * ctx.padding - + (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 + return n, channels_out, height_out, width_out + + +deform_conv = DeformConvFunction.apply +modulated_deform_conv = ModulatedDeformConvFunction.apply diff --git 
a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_pool.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_pool.py new file mode 100644 index 00000000..65ff0efb --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_pool.py @@ -0,0 +1,69 @@ +import torch +from torch.autograd import Function + +from .. import deform_pool_cuda + + +class DeformRoIPoolingFunction(Function): + + @staticmethod + def forward(ctx, + data, + rois, + offset, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + ctx.spatial_scale = spatial_scale + ctx.out_size = out_size + ctx.out_channels = out_channels + ctx.no_trans = no_trans + ctx.group_size = group_size + ctx.part_size = out_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + assert 0.0 <= ctx.trans_std <= 1.0 + if not data.is_cuda: + raise NotImplementedError + + n = rois.shape[0] + output = data.new_empty(n, out_channels, out_size, out_size) + output_count = data.new_empty(n, out_channels, out_size, out_size) + deform_pool_cuda.deform_psroi_pooling_cuda_forward( + data, rois, offset, output, output_count, ctx.no_trans, + ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, + ctx.part_size, ctx.sample_per_part, ctx.trans_std) + + if data.requires_grad or rois.requires_grad or offset.requires_grad: + ctx.save_for_backward(data, rois, offset) + ctx.output_count = output_count + + return output + + @staticmethod + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + + data, rois, offset = ctx.saved_tensors + output_count = ctx.output_count + grad_input = torch.zeros_like(data) + grad_rois = None + grad_offset = torch.zeros_like(offset) + + deform_pool_cuda.deform_psroi_pooling_cuda_backward( + grad_output, data, rois, offset, output_count, grad_input, + grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, + ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, + ctx.trans_std) + return (grad_input, grad_rois, grad_offset, None, None, None, None, + None, None, None, None) + + +deform_roi_pooling = DeformRoIPoolingFunction.apply diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_conv.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_conv.py new file mode 100644 index 00000000..c0b8286d --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_conv.py @@ -0,0 +1,158 @@ +import math + +import torch +import torch.nn as nn +from torch.nn.modules.utils import _pair + +from ..functions.deform_conv import deform_conv, modulated_deform_conv + + +class DeformConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=False): + super(DeformConv, self).__init__() + + assert not bias + assert 
in_channels % groups == 0, \ + 'in_channels {} cannot be divisible by groups {}'.format( + in_channels, groups) + assert out_channels % groups == 0, \ + 'out_channels {} cannot be divisible by groups {}'.format( + out_channels, groups) + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. / math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + + def forward(self, x, offset): + return deform_conv(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deformable_groups) + + +class DeformConvPack(DeformConv): + + def __init__(self, *args, **kwargs): + super(DeformConvPack, self).__init__(*args, **kwargs) + + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deformable_groups * 2 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x): + offset = self.conv_offset(x) + return deform_conv(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deformable_groups) + + +class ModulatedDeformConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=True): + super(ModulatedDeformConv, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.deformable_groups = deformable_groups + self.with_bias = bias + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // groups, + *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.zero_() + + def forward(self, x, offset, mask): + return modulated_deform_conv(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, self.dilation, + self.groups, self.deformable_groups) + + +class ModulatedDeformConvPack(ModulatedDeformConv): + + def __init__(self, *args, **kwargs): + super(ModulatedDeformConvPack, self).__init__(*args, **kwargs) + + self.conv_offset_mask = nn.Conv2d( + self.in_channels, + self.deformable_groups * 3 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, x): + + out = self.conv_offset_mask(x) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return modulated_deform_conv(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, self.dilation, + self.groups, self.deformable_groups) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_pool.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_pool.py new file mode 100644 index 00000000..5e019675 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_pool.py @@ -0,0 +1,172 @@ +from torch import nn + +from ..functions.deform_pool import deform_roi_pooling + + +class DeformRoIPooling(nn.Module): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DeformRoIPooling, self).__init__() + self.spatial_scale = spatial_scale + self.out_size = out_size + self.out_channels = out_channels + self.no_trans = no_trans + self.group_size = group_size + self.part_size = out_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, data, rois, offset): + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + + +class DeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + num_offset_fcs=3, + deform_fc_channels=1024): + super(DeformRoIPoolingPack, + self).__init__(spatial_scale, out_size, out_channels, no_trans, + group_size, part_size, sample_per_part, trans_std) + + self.num_offset_fcs = num_offset_fcs + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + seq = [] + ic = self.out_size * self.out_size * self.out_channels + for i in range(self.num_offset_fcs): + if i < self.num_offset_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size * self.out_size * 2 + seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_offset_fcs - 1: + seq.append(nn.ReLU(inplace=True)) + self.offset_fc = nn.Sequential(*seq) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if 
self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size, self.out_size) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + + +class ModulatedDeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + num_offset_fcs=3, + num_mask_fcs=2, + deform_fc_channels=1024): + super(ModulatedDeformRoIPoolingPack, self).__init__( + spatial_scale, out_size, out_channels, no_trans, group_size, + part_size, sample_per_part, trans_std) + + self.num_offset_fcs = num_offset_fcs + self.num_mask_fcs = num_mask_fcs + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + offset_fc_seq = [] + ic = self.out_size * self.out_size * self.out_channels + for i in range(self.num_offset_fcs): + if i < self.num_offset_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size * self.out_size * 2 + offset_fc_seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_offset_fcs - 1: + offset_fc_seq.append(nn.ReLU(inplace=True)) + self.offset_fc = nn.Sequential(*offset_fc_seq) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + mask_fc_seq = [] + ic = self.out_size * self.out_size * self.out_channels + for i in range(self.num_mask_fcs): + if i < self.num_mask_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size * self.out_size + mask_fc_seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_mask_fcs - 1: + mask_fc_seq.append(nn.ReLU(inplace=True)) + else: + mask_fc_seq.append(nn.Sigmoid()) + self.mask_fc = nn.Sequential(*mask_fc_seq) + self.mask_fc[-2].weight.data.zero_() + self.mask_fc[-2].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size, self.out_size) + mask = self.mask_fc(x.view(n, -1)) + mask = mask.view(n, 1, self.out_size, self.out_size) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) * mask diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/setup.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/setup.py new file mode 100644 
index 00000000..96380181 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/setup.py @@ -0,0 +1,15 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='deform_conv', + ext_modules=[ + CUDAExtension('deform_conv_cuda', [ + 'src/deform_conv_cuda.cpp', + 'src/deform_conv_cuda_kernel.cu', + ]), + CUDAExtension( + 'deform_pool_cuda', + ['src/deform_pool_cuda.cpp', 'src/deform_pool_cuda_kernel.cu']), + ], + cmdclass={'build_ext': BuildExtension}) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda.cpp new file mode 100644 index 00000000..c4563ed8 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda.cpp @@ -0,0 +1,695 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c + +#include + +#include +#include + +void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor data_col); + +void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im); + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const int channels, const int height, + const int width, const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor grad_offset); + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor data_col); + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor grad_im); + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, + 
const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, at::Tensor grad_offset, + at::Tensor grad_mask); + +void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, + at::Tensor weight, int kH, int kW, int dH, int dW, int padH, + int padW, int dilationH, int dilationW, int group, + int deformable_group) { + AT_CHECK(weight.ndimension() == 4, + "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s", + weight.ndimension()); + + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + AT_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, + kW); + + AT_CHECK((weight.size(2) == kH && weight.size(3) == kW), + "kernel size should be consistent with weight, ", + "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, + kW, weight.size(2), weight.size(3)); + + AT_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + AT_CHECK( + dilationW > 0 && dilationH > 0, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input.ndimension(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", + ndim); + + long nInputPlane = weight.size(1) * group; + long inputHeight = input.size(dimh); + long inputWidth = input.size(dimw); + long nOutputPlane = weight.size(0); + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + AT_CHECK(nInputPlane % deformable_group == 0, + "input channels must divide deformable group size"); + + if (outputWidth < 1 || outputHeight < 1) + AT_ERROR( + "Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). 
Output size is too small", + nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, + outputWidth); + + AT_CHECK(input.size(1) == nInputPlane, + "invalid number of input planes, expected: %d, but got: %d", + nInputPlane, input.size(1)); + + AT_CHECK((inputHeight >= kH && inputWidth >= kW), + "input image is smaller than kernel"); + + AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), + "invalid spatial size of offset, expected height: %d width: %d, but " + "got height: %d width: %d", + outputHeight, outputWidth, offset.size(2), offset.size(3)); + + AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), + "invalid number of channels of offset"); + + if (gradOutput != NULL) { + AT_CHECK(gradOutput->size(dimf) == nOutputPlane, + "invalid number of gradOutput planes, expected: %d, but got: %d", + nOutputPlane, gradOutput->size(dimf)); + + AT_CHECK((gradOutput->size(dimh) == outputHeight && + gradOutput->size(dimw) == outputWidth), + "invalid size of gradOutput, expected height: %d width: %d , but " + "got height: %d width: %d", + outputHeight, outputWidth, gradOutput->size(dimh), + gradOutput->size(dimw)); + } +} + +int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, + at::Tensor offset, at::Tensor output, + at::Tensor columns, at::Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step) { + // todo: resize columns to include im2col: done + // todo: add im2col_step as input + // todo: add new output buffer and transpose it to output (or directly + // transpose output) todo: possibly change data indexing because of + // parallel_imgs + + shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input.unsqueeze_(0); + offset.unsqueeze_(0); + } + + // todo: assert batchsize dividable by im2col_step + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, + outputHeight, outputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < outputHeight * outputWidth) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + at::Tensor output_buffer = + at::zeros({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}, + output.options()); + + output_buffer = output_buffer.view( + {output_buffer.size(0), group, output_buffer.size(1) / group, + output_buffer.size(2), output_buffer.size(3)}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + 
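// For each im2col_step-sized slice of the batch: deformable_im2col() gathers + // the offset-sampled input patches into 'columns', and the per-group addmm_ + // calls below multiply them by the weights, accumulating into output_buffer. +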
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + output_buffer[elt][g] = output_buffer[elt][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output_buffer[elt][g]); + } + } + + output_buffer = output_buffer.view( + {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), + output_buffer.size(3), output_buffer.size(4)}); + + output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step, outputHeight, outputWidth}); + output_buffer.transpose_(1, 2); + output.copy_(output_buffer); + output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + output = output.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, + at::Tensor gradOutput, at::Tensor gradInput, + at::Tensor gradOffset, at::Tensor weight, + at::Tensor columns, int kW, int kH, int dW, + int dH, int padW, int padH, int dilationW, + int dilationH, int group, + int deformable_group, int im2col_step) { + shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view({1, input.size(0), input.size(1), input.size(2)}); + offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + // change order of grad output + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, + outputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, 
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + // divide into groups + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), group, gradOutput.size(1) / group, + gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); + + for (int g = 0; g < group; g++) { + columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + gradOutput[elt][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), + gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); + + deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, + inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, im2col_step, deformable_group, + gradOffset[elt]); + + deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, gradInput[elt]); + } + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + gradOffset = gradOffset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + gradOffset = + gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_parameters_cuda( + at::Tensor input, at::Tensor offset, at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, + int padW, int padH, int dilationW, int dilationH, int group, + int deformable_group, float scale, int im2col_step) { + // todo: transpose and reshape outGrad + // todo: reshape columns + // todo: add im2col_step as input + + shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, + padW, dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view( + at::IntList({1, input.size(0), input.size(1), input.size(2)})); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = gradWeight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), "invalid 
batch size of offset"); + + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, + outputHeight, outputWidth}); + gradOutputBuffer.copy_(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}); + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + // divide into group + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, + gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + gradWeight = + gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3)}); + + for (int g = 0; g < group; g++) { + gradWeight[g] = gradWeight[g] + .flatten(1) + .addmm_(gradOutputBuffer[elt][g].flatten(1), + columns[g].transpose(1, 0), 1.0, scale) + .view_as(gradWeight[g]); + } + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), + gradOutputBuffer.size(1) * gradOutputBuffer.size(2), + gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3), + gradWeight.size(4)}); + } + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + } + + return 1; +} + +void modulated_deform_conv_cuda_forward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, + int kernel_h, int kernel_w, const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, + const int dilation_w, const int group, const int deformable_group, + const bool with_bias) { + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + 
AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + // resize output + output = output.view({batch, channels_out, height_out, width_out}).zero_(); + // resize temporary columns + columns = + at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, + input.options()); + + output = output.view({output.size(0), group, output.size(1) / group, + output.size(2), output.size(3)}); + + for (int b = 0; b < batch; b++) { + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + // divide into group + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + + for (int g = 0; g < group; g++) { + output[b][g] = output[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output[b][g]); + } + + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + output = output.view({output.size(0), output.size(1) * output.size(2), + output.size(3), output.size(4)}); + + if (with_bias) { + output += bias.view({1, bias.size(0), 1, 1}); + } +} + +void modulated_deform_conv_cuda_backward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor columns, + at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, + at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, + int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, + int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, + const bool with_bias) { + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * 
width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + grad_input = grad_input.view({batch, channels, height, width}); + columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, + input.options()); + + grad_output = + grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, + grad_output.size(2), grad_output.size(3)}); + + for (int b = 0; b < batch; b++) { + // divide int group + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + grad_output[b][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda( + columns, input[b], offset[b], mask[b], 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], + grad_mask[b]); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda( + columns, offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, grad_input[b]); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and + // group + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + grad_weight = grad_weight.view({group, grad_weight.size(0) / group, + grad_weight.size(1), grad_weight.size(2), + grad_weight.size(3)}); + if (with_bias) + grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); + + for (int g = 0; g < group; g++) { + grad_weight[g] = + grad_weight[g] + .flatten(1) + .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) + .view_as(grad_weight[g]); + if (with_bias) { + grad_bias[g] = + grad_bias[g] + .view({-1, 1}) + .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) + .view(-1); + } + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), + grad_weight.size(2), grad_weight.size(3), + grad_weight.size(4)}); + if (with_bias) + grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); + } + grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), + grad_output.size(2), grad_output.size(3), + grad_output.size(4)}); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_conv_forward_cuda", &deform_conv_forward_cuda, + "deform forward (CUDA)"); + m.def("deform_conv_backward_input_cuda", &deform_conv_backward_input_cuda, + "deform_conv_backward_input (CUDA)"); + m.def("deform_conv_backward_parameters_cuda", + &deform_conv_backward_parameters_cuda, + "deform_conv_backward_parameters (CUDA)"); + m.def("modulated_deform_conv_cuda_forward", + &modulated_deform_conv_cuda_forward, + "modulated deform conv forward (CUDA)"); + 
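// These bindings are compiled by setup.py into the 'deform_conv_cuda' extension + // module; the exported names must match the calls in ../functions/deform_conv.py. +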
m.def("modulated_deform_conv_cuda_backward", + &modulated_deform_conv_cuda_backward, + "modulated deform conv backward (CUDA)"); +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu new file mode 100644 index 00000000..fd560163 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu @@ -0,0 +1,866 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +const int kMaxGridNum = 65535; + +inline int GET_BLOCKS(const int N) +{ + return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); +} + +template +__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += 
(argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const scalar_t map_h = i * dilation_h + offset_h; + //const scalar_t map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +void deformable_im2col( + const at::Tensor data_im, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const 
int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + // todo: check parallel_imgs is correctly passed in + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.type(), "deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *data_col_ = data_col.data(); + + deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, parallel_imgs, channels, deformable_group, + height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, const scalar_t *data_col, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = 
get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +void deformable_col2im( + const at::Tensor data_col, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im) +{ + + // todo: make sure parallel_imgs is passed in correctly + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_im_ = grad_im.data(); + + deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, + ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, + const scalar_t *data_im, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, scalar_t *grad_offset) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos 
% width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + const scalar_t weight = get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) +{ + + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; + int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_offset_ = grad_offset.data(); + + deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, channels, height, width, + ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_); + })); +} + +template +__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty 
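    // The sampling point falls outside the feature map, so it contributes nothing.
    // For in-range points, the value returned below is the bilinear coefficient of the
    // integer location (h, w) with respect to (argmax_h, argmax_w): e.g. for
    // (h, w) == (argmax_h_low, argmax_w_low) it equals (1 - dh) * (1 - dw), where
    // dh = argmax_h - argmax_h_low and dw = argmax_w - argmax_w_low.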
+ return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + scalar_t *data_col_ptr = data_col + 
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + //data_col_ptr += height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + 
h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_im, + const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_offset, scalar_t *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * 
(i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const scalar_t weight = dmcn_get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.type(), "modulated_deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *data_col_ = data_col.data(); + + modulated_deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor grad_im) +{ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "modulated_deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = 
data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_im_ = grad_im.data(); + + modulated_deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + at::Tensor grad_offset, at::Tensor grad_mask) +{ + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_offset_ = grad_offset.data(); + scalar_t *grad_mask_ = grad_mask.data(); + + modulated_deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset_, grad_mask_); + })); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda.cpp new file mode 100644 index 00000000..803d5f14 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda.cpp @@ -0,0 +1,87 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c + +// based on +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +#include + +#include +#include + +void DeformablePSROIPoolForward( + const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, + at::Tensor out, at::Tensor top_count, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void 
DeformablePSROIPoolBackwardAcc( + const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, + const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, + at::Tensor trans_grad, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void deform_psroi_pooling_cuda_forward( + at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, + at::Tensor top_count, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std) { + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out.size(0), num_bbox); + + DeformablePSROIPoolForward( + input, bbox, trans, out, top_count, batch, channels, height, width, + num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, + pooled_size, part_size, sample_per_part, trans_std); +} + +void deform_psroi_pooling_cuda_backward( + at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, + at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, + const int no_trans, const float spatial_scale, const int output_dim, + const int group_size, const int pooled_size, const int part_size, + const int sample_per_part, const float trans_std) { + AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out_grad.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out_grad.size(0), num_bbox); + + DeformablePSROIPoolBackwardAcc( + out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, + channels, height, width, num_bbox, channels_trans, no_trans, + spatial_scale, output_dim, group_size, pooled_size, part_size, + sample_per_part, trans_std); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward, + "deform psroi pooling forward(CUDA)"); + m.def("deform_psroi_pooling_cuda_backward", + &deform_psroi_pooling_cuda_backward, + "deform psroi pooling backward(CUDA)"); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda_kernel.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda_kernel.cu new file mode 100644 index 00000000..e4944600 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda_kernel.cu @@ -0,0 +1,364 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ +// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ scalar_t bilinear_interp( + const scalar_t *data, + const scalar_t x, + const scalar_t y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + scalar_t dist_x = (scalar_t)(x - x1); + scalar_t dist_y = (scalar_t)(y - y1); + scalar_t value11 = data[y1 * width + x1]; + scalar_t value12 = data[y2 * width + x1]; + scalar_t value21 = data[y1 * width + x2]; + scalar_t value22 = data[y2 * width + x2]; + scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, + const scalar_t *bottom_data, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const scalar_t *bottom_rois, const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + scalar_t *top_data, + scalar_t *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? 
(scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + scalar_t sum = 0; + int count = 0; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? (scalar_t)(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const scalar_t *top_diff, + const scalar_t *top_count, + const int num_rois, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff, + const scalar_t *bottom_data, + const scalar_t *bottom_rois, + const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + scalar_t diff_val = top_diff[index] / top_count[index]; + const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + scalar_t dist_x = w - x0, dist_y = h - y0; + scalar_t q00 = (1 - dist_x) * (1 - dist_y); + scalar_t q01 = (1 - dist_x) * dist_y; + scalar_t q10 = dist_x * (1 - dist_y); + scalar_t q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + 
part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +void DeformablePSROIPoolForward(const at::Tensor data, + const at::Tensor bbox, + const at::Tensor trans, + at::Tensor out, + at::Tensor top_count, + const int batch, + const int channels, + const int height, + const int width, + const int num_bbox, + const int channels_trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data.type(), "deformable_psroi_pool_forward", ([&] { + const scalar_t *bottom_data = data.data(); + const scalar_t *bottom_rois = bbox.data(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); + scalar_t *top_data = out.data(); + scalar_t *top_count_data = top_count.data(); + + DeformablePSROIPoolForwardKernel<<>>( + count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, + bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim, + group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); + } +} + +void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, + const at::Tensor data, + const at::Tensor bbox, + const at::Tensor trans, + const at::Tensor top_count, + at::Tensor in_grad, + at::Tensor trans_grad, + const int batch, + const int channels, + const int height, + const int width, + const int num_bbox, + const int channels_trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + // LOG(INFO) << "DeformablePSROIPoolBackward"; + const int num_rois = num_bbox; + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] { + const scalar_t *top_diff = out_grad.data(); + const scalar_t *bottom_data = data.data(); + const scalar_t *bottom_rois = bbox.data(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); + scalar_t *bottom_data_diff = in_grad.data(); + scalar_t *bottom_trans_diff = no_trans ? 
NULL : trans_grad.data(); + const scalar_t *top_count_data = top_count.data(); + + DeformablePSROIPoolBackwardAccKernel<<>>( + count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width, + pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, + bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, + group_size, part_size, num_classes, channels_each_class); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dlav0.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dlav0.py new file mode 100644 index 00000000..92fdbcf8 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dlav0.py @@ -0,0 +1,647 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +from os.path import join + +import torch +from torch import nn +import torch.utils.model_zoo as model_zoo + +import numpy as np + +BatchNorm = nn.BatchNorm2d + +def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'): + return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)) + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn1 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = BatchNorm(planes) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = Bottleneck.expansion + bottle_planes = planes // expansion + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class BottleneckX(nn.Module): + expansion = 2 + cardinality = 32 + + def 
__init__(self, inplanes, planes, stride=1, dilation=1): + super(BottleneckX, self).__init__() + cardinality = BottleneckX.cardinality + # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) + # bottle_planes = dim * cardinality + bottle_planes = planes * cardinality // 32 + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, bias=False, + dilation=dilation, groups=cardinality) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class Root(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(Root, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, + stride=1, bias=False, padding=(kernel_size - 1) // 2) + self.bn = BatchNorm(out_channels) + self.relu = nn.ReLU(inplace=True) + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(torch.cat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(nn.Module): + def __init__(self, levels, block, in_channels, out_channels, stride=1, + level_root=False, root_dim=0, root_kernel_size=1, + dilation=1, root_residual=False): + super(Tree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, + dilation=dilation) + self.tree2 = block(out_channels, out_channels, 1, + dilation=dilation) + else: + self.tree1 = Tree(levels - 1, block, in_channels, out_channels, + stride, root_dim=0, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + self.tree2 = Tree(levels - 1, block, out_channels, out_channels, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + if levels == 1: + self.root = Root(root_dim, out_channels, root_kernel_size, + root_residual) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2d(in_channels, out_channels, + kernel_size=1, stride=1, bias=False), + BatchNorm(out_channels) + ) + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + residual = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Module): + def __init__(self, levels, channels, num_classes=1000, + block=BasicBlock, residual_root=False, return_levels=False, + pool_size=7, 
linear_root=False): + super(DLA, self).__init__() + self.channels = channels + self.return_levels = return_levels + self.num_classes = num_classes + self.base_layer = nn.Sequential( + nn.Conv2d(3, channels[0], kernel_size=7, stride=1, + padding=3, bias=False), + BatchNorm(channels[0]), + nn.ReLU(inplace=True)) + self.level0 = self._make_conv_level( + channels[0], channels[0], levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, + level_root=False, + root_residual=residual_root) + self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, + level_root=True, root_residual=residual_root) + self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, + level_root=True, root_residual=residual_root) + self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, + level_root=True, root_residual=residual_root) + + self.avgpool = nn.AvgPool2d(pool_size) + self.fc = nn.Conv2d(channels[-1], num_classes, kernel_size=1, + stride=1, padding=0, bias=True) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_level(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + nn.MaxPool2d(stride, stride=stride), + nn.Conv2d(inplanes, planes, + kernel_size=1, stride=1, bias=False), + BatchNorm(planes), + ) + + layers = [] + layers.append(block(inplanes, planes, stride, downsample=downsample)) + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, bias=False, dilation=dilation), + BatchNorm(planes), + nn.ReLU(inplace=True)]) + inplanes = planes + return nn.Sequential(*modules) + + def forward(self, x): + y = [] + x = self.base_layer(x) + for i in range(6): + x = getattr(self, 'level{}'.format(i))(x) + y.append(x) + if self.return_levels: + return y + else: + x = self.avgpool(x) + x = self.fc(x) + x = x.view(x.size(0), -1) + + return x + + def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'): + fc = self.fc + if name.endswith('.pth'): + model_weights = torch.load(data + name) + else: + model_url = get_model_url(data, name, hash) + model_weights = model_zoo.load_url(model_url) + num_classes = len(model_weights[list(model_weights.keys())[-1]]) + self.fc = nn.Conv2d( + self.channels[-1], num_classes, + kernel_size=1, stride=1, padding=0, bias=True) + self.load_state_dict(model_weights) + self.fc = fc + + +def dla34(pretrained, **kwargs): # DLA-34 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 128, 256, 512], + block=BasicBlock, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86') + return model + + +def dla46_c(pretrained=None, **kwargs): # DLA-46-C + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 64, 128, 256], + block=Bottleneck, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla46_c') + return model + + +def dla46x_c(pretrained=None, **kwargs): # DLA-X-46-C + 
BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 64, 128, 256], + block=BottleneckX, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla46x_c') + return model + + +def dla60x_c(pretrained, **kwargs): # DLA-X-60-C + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 64, 64, 128, 256], + block=BottleneckX, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla60x_c', hash='b870c45c') + return model + + +def dla60(pretrained=None, **kwargs): # DLA-60 + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 128, 256, 512, 1024], + block=Bottleneck, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla60') + return model + + +def dla60x(pretrained=None, **kwargs): # DLA-X-60 + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 128, 256, 512, 1024], + block=BottleneckX, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla60x') + return model + + +def dla102(pretrained=None, **kwargs): # DLA-102 + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=Bottleneck, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla102') + return model + + +def dla102x(pretrained=None, **kwargs): # DLA-X-102 + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=BottleneckX, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla102x') + return model + + +def dla102x2(pretrained=None, **kwargs): # DLA-X-102 64 + BottleneckX.cardinality = 64 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=BottleneckX, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla102x2') + return model + + +def dla169(pretrained=None, **kwargs): # DLA-169 + Bottleneck.expansion = 2 + model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024], + block=Bottleneck, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla169') + return model + + +def set_bn(bn): + global BatchNorm + BatchNorm = bn + dla.BatchNorm = bn + + +class Identity(nn.Module): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + + +class IDAUp(nn.Module): + def __init__(self, node_kernel, out_dim, channels, up_factors): + super(IDAUp, self).__init__() + self.channels = channels + self.out_dim = out_dim + for i, c in enumerate(channels): + if c == out_dim: + proj = Identity() + else: + proj = nn.Sequential( + nn.Conv2d(c, out_dim, + kernel_size=1, stride=1, bias=False), + BatchNorm(out_dim), + nn.ReLU(inplace=True)) + f = int(up_factors[i]) + if f == 1: + up = Identity() + else: + up = nn.ConvTranspose2d( + out_dim, out_dim, f * 2, stride=f, padding=f // 2, + output_padding=0, groups=out_dim, bias=False) + fill_up_weights(up) + setattr(self, 'proj_' + str(i), proj) + setattr(self, 'up_' + str(i), up) + + for i in range(1, len(channels)): + node = nn.Sequential( + nn.Conv2d(out_dim * 2, out_dim, + kernel_size=node_kernel, stride=1, + padding=node_kernel // 2, bias=False), + BatchNorm(out_dim), + nn.ReLU(inplace=True)) + setattr(self, 'node_' + str(i), node) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def forward(self, layers): + assert len(self.channels) == len(layers), \ + '{} vs {} layers'.format(len(self.channels), len(layers)) + layers = list(layers) + for i, l in enumerate(layers): + upsample = getattr(self, 'up_' + str(i)) + project = getattr(self, 'proj_' + str(i)) + layers[i] = upsample(project(l)) + x = layers[0] + y = [] + for i in range(1, len(layers)): + node = getattr(self, 'node_' + str(i)) + x = node(torch.cat([x, layers[i]], 1)) + y.append(x) + return x, y + + +class DLAUp(nn.Module): + def __init__(self, channels, scales=(1, 2, 4, 8, 16), in_channels=None): + super(DLAUp, self).__init__() + if in_channels is None: + in_channels = channels + self.channels = channels + channels = list(channels) + scales = np.array(scales, dtype=int) + for i in range(len(channels) - 1): + j = -i - 2 + setattr(self, 'ida_{}'.format(i), + IDAUp(3, channels[j], in_channels[j:], + scales[j:] // scales[j])) + scales[j + 1:] = scales[j] + in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] + + def forward(self, layers): + layers = list(layers) + assert len(layers) > 1 + for i in range(len(layers) - 1): + ida = getattr(self, 'ida_{}'.format(i)) + x, y = ida(layers[-i - 2:]) + layers[-i - 1:] = y + return x + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + # torch.nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + # torch.nn.init.xavier_normal_(m.weight.data) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + +class DLASeg(nn.Module): + def __init__(self, base_name, heads, + pretrained=True, down_ratio=4, head_conv=256): + super(DLASeg, self).__init__() + assert down_ratio in [2, 4, 8, 16] + self.heads = heads + self.first_level = int(np.log2(down_ratio)) + self.base = globals()[base_name]( + pretrained=pretrained, return_levels=True) + channels = self.base.channels + scales = [2 ** i for i in range(len(channels[self.first_level:]))] + self.dla_up = DLAUp(channels[self.first_level:], scales=scales) + ''' + self.fc = nn.Sequential( + nn.Conv2d(channels[self.first_level], classes, kernel_size=1, + stride=1, 
padding=0, bias=True) + ) + ''' + + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(channels[self.first_level], head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=1, stride=1, + padding=0, bias=True)) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(channels[self.first_level], classes, + kernel_size=1, stride=1, + padding=0, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + ''' + up_factor = 2 ** self.first_level + if up_factor > 1: + up = nn.ConvTranspose2d(classes, classes, up_factor * 2, + stride=up_factor, padding=up_factor // 2, + output_padding=0, groups=classes, + bias=False) + fill_up_weights(up) + up.weight.requires_grad = False + else: + up = Identity() + self.up = up + self.softmax = nn.LogSoftmax(dim=1) + + + for m in self.fc.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + ''' + + def forward(self, x): + x = self.base(x) + x = self.dla_up(x[self.first_level:]) + # x = self.fc(x) + # y = self.softmax(self.up(x)) + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x) + return [ret] + + ''' + def optim_parameters(self, memo=None): + for param in self.base.parameters(): + yield param + for param in self.dla_up.parameters(): + yield param + for param in self.fc.parameters(): + yield param + ''' +''' +def dla34up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla34', classes, pretrained_base=pretrained_base, **kwargs) + return model + + +def dla60up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla60', classes, pretrained_base=pretrained_base, **kwargs) + return model + + +def dla102up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla102', classes, + pretrained_base=pretrained_base, **kwargs) + return model + + +def dla169up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla169', classes, + pretrained_base=pretrained_base, **kwargs) + return model +''' + +def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4): + model = DLASeg('dla{}'.format(num_layers), heads, + pretrained=True, + down_ratio=down_ratio, + head_conv=head_conv) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet.py new file mode 100644 index 00000000..c2ed2588 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet.py @@ -0,0 +1,363 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License.
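The DLA builders above all share one head convention: heads maps a head name to its number of output channels, head_conv sets the width of the intermediate 3x3 convolution, and any head whose name contains 'hm' has its final bias filled with -2.19 (roughly log(0.1/0.9)) so the heatmap starts near a small foreground prior. A minimal construction sketch for the DLASeg defined above; the head names, channel counts and input size are illustrative assumptions, not taken from this diff, and pretrained=True downloads the DLA-34 ImageNet weights from dl.yf.io.

import torch

# Hypothetical head layout: 2 heatmap channels, 8 corner-offset channels, 2 center offsets.
heads = {'hm': 2, 'wh': 8, 'reg': 2}
model = DLASeg('dla34', heads, pretrained=True, down_ratio=4, head_conv=256)
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 512, 512))[0]  # dict of per-head maps at 1/4 input resolution
print({k: tuple(v.shape) for k, v in out.items()})  # e.g. 'hm' -> (1, 2, 128, 128)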
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=0, bias=False) + +def pad_same(x,ksize,stride=1,Pool=False): + shape = x.shape + n_in,w,h = shape[1],shape[2],shape[3] + if h % stride == 0: + pad_along_height = max(ksize - stride, 0) + else: + pad_along_height = max(ksize - (h % stride), 0) + if w % stride == 0: + pad_along_width = max(ksize - stride, 0) + else: + pad_along_width = max(ksize - (w % stride), 0) + pad_bottom = pad_along_height // 2 + pad_top = pad_along_height - pad_bottom + pad_right = pad_along_width // 2 + pad_left = pad_along_width - pad_right + dim = (pad_left,pad_right,pad_top,pad_bottom) + if Pool: + dim = (pad_right,pad_left,pad_bottom,pad_top) + x = F.pad(x,dim,"constant",value=0) + return x + +class ChannelAttention(nn.Module): + def __init__(self, in_planes, ratio=16): + super(ChannelAttention, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False) + self.relu1 = nn.ReLU() + self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x)))) + max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x)))) + + out = avg_out + max_out + + return self.sigmoid(out) + +class SpatialAttention(nn.Module): + def __init__(self): + super(SpatialAttention, self).__init__() + + self.conv1 = nn.Conv2d(2,1,3,padding=1,bias=False) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out,_ = torch.max(x, dim=1, keepdim=True) + x = torch.cat([avg_out,max_out],dim=1) + x = self.conv1(x) + return self.sigmoid(x) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + x = pad_same(x,3,self.stride) + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = pad_same(out,3,1) + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(residual) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, 
planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv, **kwargs): + self.inplanes = 64 + self.deconv_with_bias = False + self.heads = heads + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=0, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + self.adaption3 = nn.Conv2d(256,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption2 = nn.Conv2d(128,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption1 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption0 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + + self.adaptionU1 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0,bias=False) + + # used for deconv layers + self.deconv_layers1 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers2 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers3 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers4 = self._make_deconv_layer(1, [256], [4],) + + # self.final_layer = [] + + for head in sorted(self.heads): + num_output = self.heads[head] + if head_conv > 0: + inchannel = 256 + fc = nn.Sequential( + nn.Conv2d(inchannel, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output, + kernel_size=1, stride=1, padding=0)) + else: + inchannel = 256 + fc = nn.Conv2d( + in_channels=inchannel, + out_channels=num_output, + kernel_size=1, + stride=1, + padding=0 + ) + self.__setattr__(head, fc) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + 
padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + elif deconv_kernel == 7: + padding = 3 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def save_map(self,x,name): + path = '/home/rujiao.lrj/pytorch2caffe/save_map.txt' + f = open(path,'a+') + f.write('----------------------------------') + f.write('%s\n'%name) + for i in range(3): + for j in range(16): + f.write('cln:%d,line:%d\n'%(i,j)) + string = '' + for k in range(16): + string = string + str(x[i][j][k]) + ' ' + f.write(string+'\n') + + def forward(self, x): + x = pad_same(x,7,2) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x0 = pad_same(x,3,2,True) + x0 = self.maxpool(x0) + x1 = self.layer1(x0) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + + x3_ = self.deconv_layers1(x4) + x3_ = self.adaption3(x3) + x3_ + + x2_ = self.deconv_layers2(x3_) + x2_ = self.adaption2(x2) + x2_ + + x1_ = self.deconv_layers3(x2_) + x1_ = self.adaption1(x1) + x1_ + + x0_ = self.deconv_layers4(x1_) + self.adaption0(x0) + x0_ = self.adaptionU1(x0_) + + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x0_) + return [ret] + + def init_weights(self, num_layers, pretrained=True): + if pretrained: + for deconv_layer in [self.deconv_layers1,self.deconv_layers2,self.deconv_layers3]: + for _, m in deconv_layer.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + for head in self.heads: + final_layer = self.__getattr__(head) + for i, m in enumerate(final_layer.modules()): + if isinstance(m, nn.Conv2d): + if m.weight.shape[0] == self.heads[head]: + if 'hm' in head: + nn.init.constant_(m.bias, -2.19) + else: + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + self.load_state_dict(pretrained_state_dict, strict=False) + else: + print('=> imagenet pretrained model dose not exist') + print('=> please download it first') + raise ValueError('imagenet pretrained model does not exist') + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net_fpn_mask(num_layers, heads, head_conv): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, 
layers, heads, head_conv=head_conv) + model.init_weights(num_layers, pretrained=True) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet_half.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet_half.py new file mode 100644 index 00000000..983cbbc1 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet_half.py @@ -0,0 +1,460 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=0, bias=False) + +def save_map_g(x,name): + path = '/home/rujiao.lrj/CenterNet_4point_Mask_4_rotate_offset/src/save_map.txt' + f = open(path,'a+') + f.write('---------------%s-------------------\n'%name) + shape = list(x.shape) + if shape[0]>3: + c=3 + else: + c=shape[0] + if shape[1]>16: + w=16 + else: + w=shape[1] + if shape[2]>16: + h=16 + else: + h=shape[2] + for i in range(c): + for j in range(w): + f.write('cln:%d,line:%d\n'%(i,j)) + string = '' + for k in range(h): + string = string + str(x[i][j][k]) + ' ' + f.write(string+'\n') + +def pad_same(x,ksize,stride=1,Pool=False): + shape = x.shape + n_in,w,h = shape[1],shape[2],shape[3] + if h % stride == 0: + pad_along_height = max(ksize - stride, 0) + else: + pad_along_height = max(ksize - (h % stride), 0) + if w % stride == 0: + pad_along_width = max(ksize - stride, 0) + else: + pad_along_width = max(ksize - (w % stride), 0) + pad_bottom = pad_along_height // 2 + pad_top = pad_along_height - pad_bottom + pad_right = pad_along_width // 2 + pad_left = pad_along_width - pad_right + dim = (pad_left,pad_right,pad_top,pad_bottom) + if Pool: + dim = (pad_right,pad_left,pad_bottom,pad_top) + x = F.pad(x,dim,"constant",value=0) + return x + +class ChannelAttention(nn.Module): + def __init__(self, in_planes, ratio=16): + super(ChannelAttention, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False) + self.relu1 = nn.ReLU() + self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x)))) + max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x)))) + + out = avg_out + max_out + + return self.sigmoid(out) + +class SpatialAttention(nn.Module): + def __init__(self): + 
super(SpatialAttention, self).__init__() + + self.conv1 = nn.Conv2d(2,1,3,padding=1,bias=False) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out,_ = torch.max(x, dim=1, keepdim=True) + x = torch.cat([avg_out,max_out],dim=1) + x = self.conv1(x) + return self.sigmoid(x) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + self.planes = planes + + def forward(self, x): + shape = list(x.shape) + residual = x + x = pad_same(x,3,self.stride) + + out = self.conv1(x) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.conv1') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.conv1') + ''' + out = self.bn1(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.bn1') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.bn1') + ''' + out = self.relu(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.relu') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.relu') + ''' + out = pad_same(out,3,1) + out = self.conv2(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.conv2') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.conv2') + ''' + out = self.bn2(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.bn2') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.bn2') + ''' + if self.downsample is not None: + residual = self.downsample(residual) + + out += residual + out = self.relu(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'res2a.relu') + else: + save_map_g(out.cpu().numpy()[0],'res2b.relu') + ''' + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv, **kwargs): + self.inplanes = 64 + self.deconv_with_bias = False + self.heads = heads + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, 
padding=0, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 256, layers[3], stride=2) + + self.adaption3 = nn.Conv2d(256,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption2 = nn.Conv2d(128,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption1 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption0 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + + self.adaptionU1 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0,bias=False) + + # used for deconv layers + self.deconv_layers1 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers2 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers3 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers4 = self._make_deconv_layer(1, [256], [4],) + + # self.final_layer = [] + + for head in sorted(self.heads): + num_output = self.heads[head] + if head_conv > 0: + inchannel = 256 + fc = nn.Sequential( + nn.Conv2d(inchannel, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output, + kernel_size=1, stride=1, padding=0)) + else: + inchannel = 256 + fc = nn.Conv2d( + in_channels=inchannel, + out_channels=num_output, + kernel_size=1, + stride=1, + padding=0 + ) + self.__setattr__(head, fc) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + elif deconv_kernel == 7: + padding = 3 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def save_map(self,x,name): + path = '/home/rujiao.lrj/CenterNet_4point_Mask_4_rotate_offset/src/save_map.txt' + f = open(path,'a+') + 
f.write('---------------%s-------------------\n'%name) + shape = list(x.shape) + if shape[0]>3: + c=3 + else: + c=shape[0] + if shape[1]>16: + w=16 + else: + w=shape[1] + if shape[2]>16: + h=16 + else: + h=shape[2] + for i in range(c): + for j in range(w): + f.write('cln:%d,line:%d\n'%(i,j)) + string = '' + for k in range(h): + string = string + str(x[i][j][k]) + ' ' + f.write(string+'\n') + + def forward(self, x): + #self.save_map(x.cpu().numpy()[0],'input') + x = pad_same(x,7,2) + x = self.conv1(x) + #self.save_map(x.cpu().numpy()[0],'conv1') + x = self.bn1(x) + #self.save_map(x.cpu().numpy()[0],'bn1') + x = self.relu(x) + #self.save_map(x.cpu().numpy()[0],'conv1.relu') + x0 = pad_same(x,3,2,True) + x0 = self.maxpool(x0) + #self.save_map(x0.cpu().numpy()[0],'pool1') + x1 = self.layer1(x0) + #self.save_map(x1.cpu().numpy()[0],'res2b.relu') + x2 = self.layer2(x1) + #self.save_map(x2.cpu().numpy()[0],'res3b.relu') + x3 = self.layer3(x2) + #self.save_map(x3.cpu().numpy()[0],'res4b.relu') + x4 = self.layer4(x3) + #self.save_map(x4.cpu().numpy()[0],'res5b.relu') + + x3_ = self.deconv_layers1(x4) + #self.save_map(x3_.cpu().numpy()[0],'deconv_layers1.0') + x3_ = self.adaption3(x3) + x3_ + #self.save_map(x3_.cpu().numpy()[0],'res4b_') + + x2_ = self.deconv_layers2(x3_) + #self.save_map(x2_.cpu().numpy()[0],'deconv_layers2.0') + x2_ = self.adaption2(x2) + x2_ + #self.save_map(x2_.cpu().numpy()[0],'res3b_') + + x1_ = self.deconv_layers3(x2_) + #self.save_map(x1_.cpu().numpy()[0],'deconv_layers3.0') + x1_ = self.adaption1(x1) + x1_ + #self.save_map(x1_.cpu().numpy()[0],'res2b_') + + x0_ = self.deconv_layers4(x1_) + self.adaption0(x0) + x0_ = self.adaptionU1(x0_) + #self.save_map(x0_.cpu().numpy()[0],'adaptionU1') + + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x0_) + return [ret] + + #for head in self.heads: + # self.save_map(ret[head].cpu().numpy()[0],head) + + def init_weights(self, num_layers, pretrained=True): + if pretrained: + for deconv_layer in [self.deconv_layers1,self.deconv_layers2,self.deconv_layers3]: + for _, m in deconv_layer.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + for head in self.heads: + final_layer = self.__getattr__(head) + for i, m in enumerate(final_layer.modules()): + if isinstance(m, nn.Conv2d): + if m.weight.shape[0] == self.heads[head]: + if 'hm' in head: + nn.init.constant_(m.bias, -2.19) + else: + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + #self.load_state_dict(pretrained_state_dict, strict=False) + else: + print('=> imagenet pretrained model dose not exist') + print('=> please download it first') + raise ValueError('imagenet pretrained model does not exist') + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net_fpn_mask_half(num_layers, heads, head_conv): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers, pretrained=True) + return model diff --git 
a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet.py new file mode 100644 index 00000000..0abb92e9 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet.py @@ -0,0 +1,279 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import torch +import torch.nn as nn +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv, **kwargs): + self.inplanes = 64 + self.deconv_with_bias = False + self.heads = heads + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, 
stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.adaption3 = nn.Conv2d(256,256, kernel_size=3, stride=1, padding=1,bias=False) + self.adaption2 = nn.Conv2d(128,256, kernel_size=3, stride=1, padding=1,bias=False) + self.adaption1 = nn.Conv2d(64 ,256, kernel_size=3, stride=1, padding=1,bias=False) + + # used for deconv layers + self.deconv_layers1 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers2 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers3 = self._make_deconv_layer(1, [256], [4],) + + # self.final_layer = [] + + for head in sorted(self.heads): + num_output = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(256, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output, + kernel_size=1, stride=1, padding=0)) + else: + fc = nn.Conv2d( + in_channels=256, + out_channels=num_output, + kernel_size=1, + stride=1, + padding=0 + ) + self.__setattr__(head, fc) + + # self.final_layer = nn.ModuleList(self.final_layer) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x1 = self.layer1(x) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + + x3_ = self.deconv_layers1(x4) + x3_ = self.adaption3(x3) + x3_ + x2_ = self.deconv_layers2(x3_) + x2_ = self.adaption2(x2) + x2_ + x1_ = self.deconv_layers3(x2_) + x = self.adaption1(x1) + x1_ + #x = x1_ + ret = {} + for head in self.heads: + 
ret[head] = self.__getattr__(head)(x) + return [ret] + + def init_weights(self, num_layers, pretrained=True): + if pretrained: + for deconv_layer in [self.deconv_layers1,self.deconv_layers2,self.deconv_layers3]: + for _, m in deconv_layer.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + for head in self.heads: + final_layer = self.__getattr__(head) + for i, m in enumerate(final_layer.modules()): + if isinstance(m, nn.Conv2d): + if m.weight.shape[0] == self.heads[head]: + if 'hm' in head: + nn.init.constant_(m.bias, -2.19) + else: + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + self.load_state_dict(pretrained_state_dict, strict=False) + else: + print('=> imagenet pretrained model dose not exist') + print('=> please download it first') + raise ValueError('imagenet pretrained model does not exist') + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net_fpn(num_layers, heads, head_conv): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers, pretrained=True) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet_half.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet_half.py new file mode 100644 index 00000000..4b6acdad --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet_half.py @@ -0,0 +1,403 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
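The FPN-style builder just defined follows the same pattern: get_pose_net_fpn assembles a torchvision-style ResNet backbone, upsamples the deepest stage back to 1/4 resolution through three deconvolution stages with lateral adaption convolutions from the earlier stages, and attaches one small convolutional head per entry in heads. A short usage sketch under assumed settings (head names, channels and input size are hypothetical; pretrained=True fetches the ImageNet resnet18 weights via model_zoo).

import torch

heads = {'hm': 2, 'wh': 8, 'reg': 2}  # hypothetical head configuration
model = get_pose_net_fpn(num_layers=18, heads=heads, head_conv=64)
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 512, 512))[0]  # dict of head outputs at 1/4 input resolution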
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=0, bias=False) + +class ChannelAttention(nn.Module): + def __init__(self, in_planes, ratio=16): + super(ChannelAttention, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False) + self.relu1 = nn.ReLU() + self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x)))) + max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x)))) + + out = avg_out + max_out + + return self.sigmoid(out) + +class SpatialAttention(nn.Module): + def __init__(self): + super(SpatialAttention, self).__init__() + + self.conv1 = nn.Conv2d(2,1,3,padding=1,bias=False) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out,_ = torch.max(x, dim=1, keepdim=True) + x = torch.cat([avg_out,max_out],dim=1) + x = self.conv1(x) + return self.sigmoid(x) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,stride=stride, padding=1) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + self.planes = planes + + def forward(self, x): + shape = list(x.shape) + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + residual = self.downsample(residual) + + out += residual + out = self.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def 
forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv, **kwargs): + self.inplanes = 64 + self.deconv_with_bias = False + self.heads = heads + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 256, layers[3], stride=2) + + self.adaption3 = nn.Conv2d(256,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption2 = nn.Conv2d(128,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption1 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption0 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + + self.adaptionU1 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0,bias=False) + + # used for deconv layers + self.deconv_layers1 = self._make_deconv_layer(1, [256], [4],) #nn.Upsample(scale_factor=2, mode='bilinear',align_corners=False) + self.deconv_layers2 = self._make_deconv_layer(1, [256], [4],) #nn.Upsample(scale_factor=2, mode='bilinear',align_corners=False)# + self.deconv_layers3 = self._make_deconv_layer(1, [256], [4],) #nn.Upsample(scale_factor=2, mode='bilinear',align_corners=False)# + self.deconv_layers4 = self._make_deconv_layer(1, [256], [4],) #nn.Upsample(scale_factor=2, mode='bilinear',align_corners=False)# + + self.hm_maxpool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) + self.hm_sigmoid = nn.Sigmoid() + self.mk_maxpool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) + self.mk_sigmoid = nn.Sigmoid() + #self.hm_sig_for_pool = nn.Sigmoid() + + #self.cls = nn.Sequential(nn.Conv2d(256, 64,kernel_size=3, padding=1, bias=True), + # nn.ReLU(inplace=True), + # nn.Conv2d(64, 4, kernel_size=1, stride=1, padding=0)) + + # self.final_layer = [] + for head in sorted(self.heads): + num_output = self.heads[head] + if head_conv > 0 and (head=='reg' or head=='mk_reg'): + inchannel = 256 + fc = nn.Sequential( + nn.Conv2d(inchannel, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output, + kernel_size=1, stride=1, padding=0)) + elif head_conv > 0: + inchannel = 256 + fc = nn.Sequential( + nn.Conv2d(inchannel, head_conv, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, head_conv, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, head_conv, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, head_conv, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output,kernel_size=1, stride=1, padding=0)) + else: + inchannel = 256 + fc = nn.Conv2d( + in_channels=inchannel, + out_channels=num_output, + kernel_size=1, + stride=1, + padding=0 + ) + self.__setattr__(head, fc) + + def _make_layer(self, block, 
planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + elif deconv_kernel == 7: + padding = 3 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def save_map(self,X,name): + x = X[0].data.cpu().numpy() + print(x.shape) + path = '/home/rujiao.lrj/CenterNet_huntie/src/%s.txt'%name + f = open(path,'w') + #f.write('---------------%s-------------------\n'%name) + f.write(name+'\n') + f.write(str(1)+' '+str(x.shape[0])+' '+str(x.shape[1])+' '+str(x.shape[2])+'\n') + shape = list(x.shape) + if shape[0]>3: + c=3 + else: + c=shape[0] + if shape[1]>16: + w=16 + else: + w=shape[1] + if shape[2]>16: + h=16 + else: + h=shape[2] + c,h,w = x.shape + for i in range(c): + for j in range(h): + #f.write('cln:%d,line:%d\n'%(i,j)) + #string = '' + for k in range(w): + #string = string + str(x[i][j][k]) + ' ' + f.write(str(x[i][j][k])+' ') + #f.write(string+'\n') + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x0 = self.maxpool(x) + x1 = self.layer1(x0) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + + x3_ = self.deconv_layers1(x4) + x3_ = self.adaption3(x3) + x3_ + + x2_ = self.deconv_layers2(x3_) + x2_ = self.adaption2(x2) + x2_ + + x1_ = self.deconv_layers3(x2_) + x1_ = self.adaption1(x1) + x1_ + + x0_ = self.deconv_layers4(x1_) + self.adaption0(x0) + x0_ = self.adaptionU1(x0_) + + ret = {} + + # leave it alone is ok + #xcls = x0_ + #for name, midlayer in self.cls._modules.items(): + # xcls = midlayer(xcls) + #ret['cls'] = xcls + + #training version + for head in self.heads: + ret[head] = self.__getattr__(head)(x0_) + + #onnx version + # hm = self.__getattr__('hm')(x0_) + # wh = self.__getattr__('wh')(x0_) + # st = self.__getattr__('st')(x0_) + # reg = self.__getattr__('reg')(x0_) + # cr = self.__getattr__('cr')(x0_) + # ax = self.__getattr__('ax')(x0_) + + + # ret['hm_sigmoid'] = self.hm_sigmoid(hm) + # ret['hm_maxpool'] = self.hm_maxpool(ret['hm_sigmoid']) + # ret['wh'] = wh + # ret['st'] = st + # ret['ax'] = ax + # 
ret['cr']= cr + # ret['reg']= reg + + + return [ret]#ret['hm'], ret['st'], ret['wh'], ret['ax'], ret['cr'], ret['reg']#[ret] + + def init_weights(self, num_layers, pretrained=True): + if pretrained: + for deconv_layer in [self.deconv_layers1,self.deconv_layers2,self.deconv_layers3]: + for _, m in deconv_layer.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + for head in self.heads: + final_layer = self.__getattr__(head) + for i, m in enumerate(final_layer.modules()): + if isinstance(m, nn.Conv2d): + if m.weight.shape[0] == self.heads[head]: + if 'hm' in head: + nn.init.constant_(m.bias, -2.19) + else: + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + url = model_urls['resnet{}'.format(num_layers)] + #pretrained_state_dict = model_zoo.load_url(url) + #print('=> loading pretrained model {}'.format(url)) + #self.load_state_dict(pretrained_state_dict, strict=False) + else: + print('=> imagenet pretrained model dose not exist') + print('=> please download it first') + raise ValueError('imagenet pretrained model does not exist') + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net_fpn_half(num_layers, heads, head_conv): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers, pretrained=True) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/pose_dla_dcn.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/pose_dla_dcn.py new file mode 100644 index 00000000..0f761776 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/pose_dla_dcn.py @@ -0,0 +1,559 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import math +import logging +import numpy as np +from os.path import join + +import torch +from torch import nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo +import torchvision.ops +from torch.nn.modules.utils import _pair + +# from .DCNv2.dcn_v2 import DCN +#from .dcn.modules.deform_conv import ModulatedDeformConvPack as DCN +# from mmcv.ops import ModulatedDeformConv2dPack as DCN +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + +def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'): + return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)) + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = 
nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = Bottleneck.expansion + bottle_planes = planes // expansion + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class BottleneckX(nn.Module): + expansion = 2 + cardinality = 32 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BottleneckX, self).__init__() + cardinality = BottleneckX.cardinality + # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) + # bottle_planes = dim * cardinality + bottle_planes = planes * cardinality // 32 + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, bias=False, + dilation=dilation, groups=cardinality) + self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class Root(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(Root, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, + stride=1, bias=False, padding=(kernel_size - 1) // 2) + self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(torch.cat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(nn.Module): + def __init__(self, levels, block, in_channels, out_channels, stride=1, + level_root=False, root_dim=0, root_kernel_size=1, + dilation=1, root_residual=False): + super(Tree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.tree1 = 
block(in_channels, out_channels, stride, + dilation=dilation) + self.tree2 = block(out_channels, out_channels, 1, + dilation=dilation) + else: + self.tree1 = Tree(levels - 1, block, in_channels, out_channels, + stride, root_dim=0, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + self.tree2 = Tree(levels - 1, block, out_channels, out_channels, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + if levels == 1: + self.root = Root(root_dim, out_channels, root_kernel_size, + root_residual) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2d(in_channels, out_channels, + kernel_size=1, stride=1, bias=False), + nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) + ) + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + residual = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Module): + def __init__(self, levels, channels, num_classes=1000, + block=BasicBlock, residual_root=False, linear_root=False): + super(DLA, self).__init__() + self.channels = channels + self.num_classes = num_classes + self.base_layer = nn.Sequential( + nn.Conv2d(3, channels[0], kernel_size=7, stride=1, + padding=3, bias=False), + nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM), + nn.ReLU(inplace=True)) + self.level0 = self._make_conv_level( + channels[0], channels[0], levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, + level_root=False, + root_residual=residual_root) + self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, + level_root=True, root_residual=residual_root) + self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, + level_root=True, root_residual=residual_root) + self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, + level_root=True, root_residual=residual_root) + + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + # m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + # elif isinstance(m, nn.BatchNorm2d): + # m.weight.data.fill_(1) + # m.bias.data.zero_() + + def _make_level(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + nn.MaxPool2d(stride, stride=stride), + nn.Conv2d(inplanes, planes, + kernel_size=1, stride=1, bias=False), + nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(inplanes, planes, stride, downsample=downsample)) + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, bias=False, dilation=dilation), + nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True)]) + inplanes = planes + return nn.Sequential(*modules) + + def forward(self, x): + y = [] + x = self.base_layer(x) + for i in range(6): + x = getattr(self, 'level{}'.format(i))(x) + y.append(x) + return y + + def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'): + # fc = self.fc + print('****load backbone') + if name.endswith('.pth'): + model_weights = torch.load(data + name) + else: + model_url = get_model_url(data, name, hash) + model_weights = model_zoo.load_url(model_url) + num_classes = len(model_weights[list(model_weights.keys())[-1]]) + self.fc = nn.Conv2d( + self.channels[-1], num_classes, + kernel_size=1, stride=1, padding=0, bias=True) + self.load_state_dict(model_weights) + # self.fc = fc + + +def dla34(pretrained=True, **kwargs): # DLA-34 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 128, 256, 512], + block=BasicBlock, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86') + return model + +class Identity(nn.Module): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + + +class DeformConv(nn.Module): + def __init__(self, chi, cho): + super(DeformConv, self).__init__() + self.actf = nn.Sequential( + nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True) + ) + self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1) + + def forward(self, x): + x = self.conv(x) + x = self.actf(x) + return x + + +class DCN(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + deformable_groups, + ): + + super(DCN, self).__init__() + + self.in_channels = in_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.deformable_groups = deformable_groups + + self.conv_offset_mask = nn.Conv2d(in_channels, + 3 * self.kernel_size[0] * self.kernel_size[1], + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True) + + nn.init.constant_(self.conv_offset_mask.weight, 0.) + nn.init.constant_(self.conv_offset_mask.bias, 0.) + + self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size)) + self.bias = nn.Parameter(torch.Tensor(out_channels)) + + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1.0 / math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + self.bias.data.zero_() + + def forward(self, x): + + out = self.conv_offset_mask(x) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + + x = torchvision.ops.deform_conv2d(input=x, + offset=offset, + weight=self.weight, + bias=self.bias, + padding=self.padding, + stride=self.stride, + mask=mask, + ) + return x + + +class IDAUp(nn.Module): + + def __init__(self, o, channels, up_f): + super(IDAUp, self).__init__() + for i in range(1, len(channels)): + c = channels[i] + f = int(up_f[i]) + proj = DeformConv(c, o) + node = DeformConv(o, o) + + up = nn.ConvTranspose2d(o, o, f * 2, stride=f, + padding=f // 2, output_padding=0, + groups=o, bias=False) + fill_up_weights(up) + + setattr(self, 'proj_' + str(i), proj) + setattr(self, 'up_' + str(i), up) + setattr(self, 'node_' + str(i), node) + + + def forward(self, layers, startp, endp): + for i in range(startp + 1, endp): + upsample = getattr(self, 'up_' + str(i - startp)) + project = getattr(self, 'proj_' + str(i - startp)) + layers[i] = upsample(project(layers[i])) + node = getattr(self, 'node_' + str(i - startp)) + layers[i] = node(layers[i] + layers[i - 1]) + + + +class DLAUp(nn.Module): + def __init__(self, startp, channels, scales, in_channels=None): + super(DLAUp, self).__init__() + self.startp = startp + if in_channels is None: + in_channels = channels + self.channels = channels + channels = list(channels) + scales = np.array(scales, dtype=int) + for i in range(len(channels) - 1): + j = -i - 2 + setattr(self, 'ida_{}'.format(i), + IDAUp(channels[j], in_channels[j:], + scales[j:] // scales[j])) + scales[j + 1:] = scales[j] + in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] + + def forward(self, layers): + out = [layers[-1]] # start with 32 + for i in range(len(layers) - self.startp - 1): + ida = getattr(self, 'ida_{}'.format(i)) + ida(layers, len(layers) -i - 2, len(layers)) + 
out.insert(0, layers[-1]) + return out + + +class Interpolate(nn.Module): + def __init__(self, scale, mode): + super(Interpolate, self).__init__() + self.scale = scale + self.mode = mode + + def forward(self, x): + x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) + return x + + +class DLASeg(nn.Module): + def __init__(self, base_name, heads, pretrained, down_ratio, final_kernel, + last_level, head_conv, out_channel=0): + super(DLASeg, self).__init__() + assert down_ratio in [2, 4, 8, 16] + self.first_level = int(np.log2(down_ratio)) + self.last_level = last_level + print('****identify network') + self.base = globals()[base_name](pretrained=pretrained) + channels = self.base.channels + scales = [2 ** i for i in range(len(channels[self.first_level:]))] + self.dla_up = DLAUp(self.first_level, channels[self.first_level:], scales) + + if out_channel == 0: + out_channel = channels[self.first_level] + + self.ida_up = IDAUp(out_channel, channels[self.first_level:self.last_level], + [2 ** i for i in range(self.last_level - self.first_level)]) + + self.heads = heads + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(channels[self.first_level], head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=final_kernel, stride=1, + padding=final_kernel // 2, bias=True)) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(channels[self.first_level], classes, + kernel_size=final_kernel, stride=1, + padding=final_kernel // 2, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + def forward(self, x): + x = self.base(x) + x = self.dla_up(x) + + y = [] + for i in range(self.last_level - self.first_level): + y.append(x[i].clone()) + self.ida_up(y, 0, len(y)) + + z = {} + for head in self.heads: + z[head] = self.__getattr__(head)(y[-1]) + return z['hm'], z['st'], z['wh'], z['ax'], z['cr'], z['reg']#, y[-1] + + +def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4): + model = DLASeg('dla{}'.format(num_layers), heads, + pretrained=True, + down_ratio=down_ratio, + final_kernel=1, + last_level=5, + head_conv=head_conv) + return model + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/resnet_dcn.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/resnet_dcn.py new file mode 100644 index 00000000..805c2c3b --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/resnet_dcn.py @@ -0,0 +1,290 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
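+# ResNet backbone whose upsampling path stacks modulated deformable convolution (DCN)
+# and ConvTranspose2d stages, feeding CenterNet-style output heads built from `heads`.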
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Dequan Wang and Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import math +import logging + +import torch +import torch.nn as nn +from .DCNv2.dcn_v2 import DCN +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + # torch.nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + # torch.nn.init.xavier_normal_(m.weight.data) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv): + self.inplanes = 64 + self.heads = heads + self.deconv_with_bias = False + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + # used for deconv layers + self.deconv_layers = self._make_deconv_layer( + 3, + [256, 128, 64], + [4, 4, 4], + ) + + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(64, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=1, stride=1, + padding=0, bias=True)) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(64, classes, + kernel_size=1, stride=1, + padding=0, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + fc = DCN(self.inplanes, planes, + kernel_size=(3,3), stride=1, + padding=1, dilation=1, deformable_groups=1) + # fc = nn.Conv2d(self.inplanes, planes, + # kernel_size=3, stride=1, + # padding=1, dilation=1, bias=False) + # fill_fc_weights(fc) + up = nn.ConvTranspose2d( + in_channels=planes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + 
output_padding=output_padding, + bias=self.deconv_with_bias) + fill_up_weights(up) + + layers.append(fc) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + layers.append(up) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.deconv_layers(x) + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x) + return [ret] + + def init_weights(self, num_layers): + if 1: + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + self.load_state_dict(pretrained_state_dict, strict=False) + print('=> init deconv weights from normal distribution') + for name, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net(num_layers, heads, head_conv=256): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/scatter_gather.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/scatter_gather.py new file mode 100644 index 00000000..9a460584 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/scatter_gather.py @@ -0,0 +1,38 @@ +import torch +from torch.autograd import Variable +from torch.nn.parallel._functions import Scatter, Gather + + +def scatter(inputs, target_gpus, dim=0, chunk_sizes=None): + r""" + Slices variables into approximately equal chunks and + distributes them across given GPUs. Duplicates + references to objects that are not variables. Does not + support Tensors. + """ + def scatter_map(obj): + if isinstance(obj, Variable): + return Scatter.apply(target_gpus, chunk_sizes, dim, obj) + assert not torch.is_tensor(obj), "Tensors not supported in scatter." 
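+        # Containers are scattered recursively: tuples/lists/dicts are taken apart,
+        # each element is scattered across the target GPUs, and the pieces are
+        # regrouped per device; any other object is duplicated (by reference) to every GPU.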
+ if isinstance(obj, tuple): + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list): + return list(map(list, zip(*map(scatter_map, obj)))) + if isinstance(obj, dict): + return list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return [obj for targets in target_gpus] + + return scatter_map(inputs) + + +def scatter_kwargs(inputs, kwargs, target_gpus, dim=0, chunk_sizes=None): + r"""Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, dim, chunk_sizes) if inputs else [] + kwargs = scatter(kwargs, target_gpus, dim, chunk_sizes) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/transformer.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/transformer.py new file mode 100644 index 00000000..92e88e7e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/transformer.py @@ -0,0 +1,305 @@ +import torch +import torch.nn as nn +import copy +import math +from torch.autograd import Variable +import torch.nn.functional as F + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, input_size, hidden_size, N, heads, dropout): + super().__init__() + self.N = N + self.pe = PositionalEncoder(hidden_size, dropout=dropout) + self.layers = get_clones(EncoderLayer(hidden_size, heads, dropout), N) + self.norm = Norm(hidden_size) + def forward(self, x, mask = None, require_att = False): + #x = self.pe(x) + att = None + for i in range(self.N): + if mask is None: + if i == (self.N - 1): + x, att = self.layers[i](x, require_att = True) + else: + x = self.layers[i](x) + else: + x = self.layers[i](x, mask) + if require_att: + return x, att + else: + return x + +class Decoder(nn.Module): + def __init__(self, hidden_size, output_size): + super(Decoder, self).__init__() + self.linear = nn.Sequential( + nn.Linear(hidden_size, hidden_size), + nn.ReLU(inplace=True), + nn.Linear(hidden_size, output_size), + nn.ReLU(inplace=True) #newly added + ) + + def forward(self, x): + out = self.linear(x) + return out + +class Transformer(nn.Module): + def __init__(self, input_size, hidden_size, output_size, n_layers, heads, dropout): + super().__init__() + self.linear = nn.Linear(input_size, hidden_size) + self.encoder = Encoder(input_size, hidden_size, n_layers, heads, dropout) + self.decoder = Decoder(hidden_size, output_size) + def forward(self, x, mask = None, require_att = False): + x = self.linear(x) + att = None + if mask is None: + #evaluation model + if require_att: + embedding, att = self.encoder(x, require_att = True) + else: + embedding = self.encoder(x) + + output = self.decoder(embedding) + + if require_att: + return output, att + else: + return output#, att + else: + if require_att : + embedding, att = self.encoder(x, mask, require_att = True) + else: + embedding = self.encoder(x, mask) + + output = self.decoder(embedding) + return output + +def get_model(opt, src_vocab, trg_vocab): + assert opt.d_model % opt.heads == 0 + assert opt.dropout < 1 + model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads, opt.dropout) + + if opt.load_weights is not None: + 
print("loading pretrained weights...") + model.load_state_dict(torch.load(f'{opt.load_weights}/model_weights')) + else: + for p in model.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + if opt.device == 0: + model = model.cuda() + + return model + +# Sublayer +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + if len(mask.shape) == 2: + mask = mask.unsqueeze(1) + mask = mask.unsqueeze(3) + mask = mask.to(torch.float32) + mask2d = torch.matmul(mask, mask.transpose(-2, -1)).expand(scores.shape[0], scores.shape[1], scores.shape[2], scores.shape[3]) + elif len(mask.shape) == 3: + mask = mask.unsqueeze(1) + mask = mask.to(torch.float32) + mask2d = mask.expand(scores.shape[0], scores.shape[1], scores.shape[2], scores.shape[3]) + + + scores = scores.masked_fill(mask2d == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + + +def attention_score(q, k, v, d_k): + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + scores = F.softmax(scores, dim=-1) + return scores + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def attention_map(self, q, k, v, mask=None): + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + scores = attention_score(q, k, v, self.d_k) + + return scores + + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = 
self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + +#Embedding + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 900, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask = None, require_att = False): + x2 = self.norm_1(x) + xc = x2.clone() + + if mask is None: + x = x + self.dropout_1(self.attn(x2,x2,x2)) + else: + x = x + self.dropout_1(self.attn(x2,x2,x2, mask)) + + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + + if require_att : + att = self.attn.attention_map(xc, xc, xc) + return x, att + else : + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/utils.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/utils.py new file mode 100644 index 00000000..df96a577 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/utils.py @@ -0,0 +1,114 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import numpy +import torch.nn as nn + +def _sigmoid(x): + y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4) + return y + +def _h_dist_feat(output, width): + feat = (output[:,:,0] + output[:,:,1])/(2*(width+1)) + return feat + +def 
_make_pair_feat(output): + if len(output.shape) == 2: + output = output.unsqueeze(2) + + output1 = output.unsqueeze(1).expand(output.size(0), output.size(1), output.size(1), output.size(2)) + output2 = output.unsqueeze(2).expand(output.size(0), output.size(1), output.size(1), output.size(2)) + output_paired = torch.cat((output1, output2), 3) + + return output_paired + +def _v_dist_feat(output, height): + feat = (output[:,:,2] + output[:,:,3])/(2*(height + 1)) + return feat + +def _gather_feat(feat, ind, mask=None): + dim = feat.size(2) + ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) + feat = feat.gather(1, ind) + if mask is not None: + mask = mask.unsqueeze(2).expand_as(feat) + feat = feat[mask] + feat = feat.view(-1, dim) + return feat + +def _flatten_and_gather_feat(output, ind): + dim = output.size(3) + ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) + output = output.contiguous().view(output.size(0), -1, output.size(3)) + output1 = output.gather(1, ind) + + return output1 + +def _get_4ps_feat(cc_match, output): + if isinstance(output, dict): + feat = output['cr'] + else : + feat = output + device = feat.device + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.contiguous().view(feat.size(0), -1, feat.size(3)) + feat = feat.unsqueeze(3).expand(feat.size(0), feat.size(1), feat.size(2), 4) + + dim = feat.size(2) + cc_match = cc_match.unsqueeze(2).expand(cc_match.size(0), cc_match.size(1), dim, cc_match.size(2)) + if not(isinstance(output, dict)): + cc_match = torch.where(cc_match=0, cc_match, torch.zeros(cc_match.shape).to(torch.int64).to(device)) + feat = feat.gather(1, cc_match) + return feat + +def _get_wh_feat(ind, output, ttype): + + width = output['hm'].shape[2] + xs = (ind % width).unsqueeze(2).int().float() + ys = (ind // width).unsqueeze(2).int().float() + if ttype == 'gt': + wh = output['wh'] + elif ttype == 'pred': + wh = _tranpose_and_gather_feat(output['wh'], ind) + ct = torch.cat([xs, ys, xs, ys, xs, ys, xs, ys], dim=2) + bbx = ct - wh + + return bbx + +def _normalized_ps(ps, vocab_size): + device = ps.device + ps = torch.round(ps).to(torch.int64) + ps = torch.where(ps < vocab_size, ps, (vocab_size-1) * torch.ones(ps.shape).to(torch.int64).to(device)) + ps = torch.where(ps >= 0, ps, torch.zeros(ps.shape).to(torch.int64).to(device)) + return ps + +def _tranpose_and_gather_feat(feat, ind): + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = _gather_feat(feat, ind) + return feat + +def flip_tensor(x): + return torch.flip(x, [3]) + +def flip_lr(x, flip_idx): + tmp = x.detach().cpu().numpy()[..., ::-1].copy() + shape = tmp.shape + for e in flip_idx: + tmp[:, e[0], ...], tmp[:, e[1], ...] = \ + tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() + return torch.from_numpy(tmp.reshape(shape)).to(x.device) + +def flip_lr_off(x, flip_idx): + tmp = x.detach().cpu().numpy()[..., ::-1].copy() + shape = tmp.shape + tmp = tmp.reshape(tmp.shape[0], 17, 2, + tmp.shape[2], tmp.shape[3]) + tmp[:, :, 0, :, :] *= -1 + for e in flip_idx: + tmp[:, e[0], ...], tmp[:, e[1], ...] = \ + tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() + return torch.from_numpy(tmp.reshape(shape)).to(x.device) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d41e1389..fa891564 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,4 @@ faster_whisper==1.0.1 python-dotenv==1.0.1 duckduckgo-search==5.3.0b4 html2text==2024.2.26 -mistune-3.0.2 +mistune==3.0.2
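For reference, a minimal sketch (not part of the patch) of how the DLA-based network defined in pose_dla_dcn.py above is typically instantiated and queried. The import path and the head channel sizes are illustrative assumptions; the diff itself only defines the head names ('hm', 'st', 'wh', 'ax', 'cr', 'reg') and the return order of DLASeg.forward.

    import torch
    # Assumed import path; adjust to wherever lib/ sits on sys.path.
    from lib.models.networks.pose_dla_dcn import get_pose_net

    # Assumed head sizes, for illustration only; the patch does not pin these values.
    heads = {'hm': 2, 'st': 8, 'wh': 8, 'ax': 256, 'cr': 256, 'reg': 2}

    # Builds DLASeg on a dla34 backbone; pretrained=True downloads ImageNet
    # weights inside DLA.load_pretrained_model().
    model = get_pose_net(num_layers=34, heads=heads, head_conv=256, down_ratio=4).eval()

    x = torch.randn(1, 3, 512, 512)             # one RGB image
    with torch.no_grad():
        hm, st, wh, ax, cr, reg = model(x)      # six head maps at 1/4 input resolution
    print(hm.shape)                             # (1, 2, 128, 128) with the sizes assumed above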