diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/classifier.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/classifier.py new file mode 100644 index 00000000..e307d9b7 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/classifier.py @@ -0,0 +1,155 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import torch +import torch.nn as nn +from .utils import _tranpose_and_gather_feat, _get_wh_feat, _get_4ps_feat, _normalized_ps +import torch.nn.functional as F + +import json +import cv2 +import os +from .transformer import Transformer +import math +import time +import random +import imgaug.augmenters as iaa +import time +import copy + +class Stacker(nn.Module): + def __init__(self, input_size, hidden_size, output_size, layers, heads=8, dropout=0.1): + super(Stacker, self).__init__() + self.logi_encoder = nn.Sequential( + nn.Linear(input_size, hidden_size), + nn.ReLU(inplace=True), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(inplace=True) #newly added + ) + self.tsfm = Transformer(2 * hidden_size, hidden_size, output_size, layers, heads, dropout) + + def forward(self, outputs, logi, mask = None, require_att = False): + logi_embeddings = self.logi_encoder(logi) + + cat_embeddings = torch.cat((logi_embeddings, outputs), dim=2) + + if mask is None: + if require_att: + stacked_axis, att = self.tsfm(cat_embeddings) + else: + stacked_axis = self.tsfm(cat_embeddings) + else: + stacked_axis = self.tsfm(cat_embeddings, mask=mask) + + if require_att: + return stacked_axis, att + else: + return stacked_axis + +class Processor(nn.Module): + def __init__(self, opt): + super(Processor, self).__init__() + + if opt.wiz_stacking: + self.stacker = Stacker(opt.output_size, opt.hidden_size, opt.output_size, opt.stacking_layers) + + #input_state, hidden_state, output_state, layers, heads, dropout + self.tsfm_axis = Transformer(opt.input_size, opt.hidden_size, opt.output_size, opt.tsfm_layers, opt.num_heads, opt.att_dropout) #original version + self.x_position_embeddings = nn.Embedding(opt.max_fmp_size, opt.hidden_size) + self.y_position_embeddings = nn.Embedding(opt.max_fmp_size, opt.hidden_size) + + self.opt = opt + + def forward(self, outputs, dets = None, batch = None, cc_match = None): #training version forward + # 'outputs' stands for the feature of cells + # mask = None + # att = None + + ''' + Constructing Features: + ''' + if batch is None: + # Inference Mode, the four corner features are gathered + # during bounding boxes decoding for simplicity (See ctdet_4ps_decode() in ./src/lib/model/decode.py). + + vis_feat = outputs + if dets is None: + feat = vis_feat + + else: + left_pe = self.x_position_embeddings(dets[:, :, 0]) + upper_pe = self.y_position_embeddings(dets[:, :, 1]) + right_pe = self.x_position_embeddings(dets[:, :, 2]) + lower_pe = self.y_position_embeddings(dets[:, :, 5]) + feat = vis_feat + left_pe + upper_pe + right_pe + lower_pe + + # !TODO: moving the processings here and uniform the feature construction code for training and inference. 
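+ # In inference mode the decoded corner boxes double as a coarse 2-D position signal:
+ # dets[:, :, 0] / dets[:, :, 1] give the left-x / upper-y and dets[:, :, 2] / dets[:, :, 5]
+ # the right-x / lower-y of the 8-value box; their x/y position embeddings are summed onto
+ # the visual cell feature, mirroring the wiz_2dpe branch of the training path below.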
+ + else: + #Training Mode + ind = batch['hm_ind'] + mask = batch['hm_mask'] #during training, the attention mask will be applied + output = outputs[-1] + pred = output['ax'] + ct_feat = _tranpose_and_gather_feat(pred, ind) + + if self.opt.wiz_2dpe: + cr_feat = _get_4ps_feat(batch['cc_match'], output) + cr_feat = cr_feat.sum(axis = 3) + vis_feat = ct_feat + cr_feat + + ps = _get_wh_feat(ind, batch, 'gt') + ps = _normalized_ps(ps, self.opt.max_fmp_size) + + left_pe = self.x_position_embeddings(ps[:, :, 0]) + upper_pe = self.y_position_embeddings(ps[:, :, 1]) + right_pe = self.x_position_embeddings(ps[:, :, 2]) + lower_pe = self.y_position_embeddings(ps[:, :, 5]) + + feat = vis_feat + left_pe + upper_pe + right_pe + lower_pe + + elif self.opt.wiz_4ps: + cr_feat = _get_4ps_feat(batch['cc_match'], output) + cr_feat = cr_feat.sum(axis = 3) + feat = ct_feat + cr_feat + + elif self.opt.wiz_vanilla: + feat = ct_feat + + ''' + Put Features into TSFM: + ''' + + if batch is None: + #Inference Mode + logic_axis = self.tsfm_axis(feat) + if self.opt.wiz_stacking: + stacked_axis = self.stacker(feat, logic_axis) + else: + #Training Mode + logic_axis = self.tsfm_axis(feat, mask = mask) + if self.opt.wiz_stacking: + stacked_axis = self.stacker(feat, logic_axis, mask = mask) + + if self.opt.wiz_stacking: + return logic_axis, stacked_axis + else: + return logic_axis + +def load_processor(model, model_path, optimizer=None, resume=False, lr=None, lr_step=None): + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) + print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch'])) + state_dict = checkpoint['state_dict'] + model.load_state_dict(state_dict) + return model + +def _judge(box): + countx = len(list(set([box[0],box[2],box[4],box[6]]))) + county = len(list(set([box[1],box[3],box[5],box[7]]))) + if countx<2 or county<2: + return False + + return True + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/data_parallel.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/data_parallel.py new file mode 100644 index 00000000..1a96c0d2 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/data_parallel.py @@ -0,0 +1,128 @@ +import torch +from torch.nn.modules import Module +from torch.nn.parallel.scatter_gather import gather +from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.parallel_apply import parallel_apply + + +from .scatter_gather import scatter_kwargs + +class _DataParallel(Module): + r"""Implements data parallelism at the module level. + + This container parallelizes the application of the given module by + splitting the input across the specified devices by chunking in the batch + dimension. In the forward pass, the module is replicated on each device, + and each replica handles a portion of the input. During the backwards + pass, gradients from each replica are summed into the original module. + + The batch size should be larger than the number of GPUs used. It should + also be an integer multiple of the number of GPUs so that each chunk is the + same size (so that each GPU processes the same number of samples). + + See also: :ref:`cuda-nn-dataparallel-instead` + + Arbitrary positional and keyword inputs are allowed to be passed into + DataParallel EXCEPT Tensors. All variables will be scattered on dim + specified (default 0). 
Primitive types will be broadcasted, but all + other types will be a shallow copy and can be corrupted if written to in + the model's forward pass. + + Args: + module: module to be parallelized + device_ids: CUDA devices (default: all devices) + output_device: device location of output (default: device_ids[0]) + + Example:: + + >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2]) + >>> output = net(input_var) + """ + + # TODO: update notes/cuda.rst when this class handles 8+ GPUs well + + def __init__(self, module, device_ids=None, output_device=None, dim=0, chunk_sizes=None): + super(_DataParallel, self).__init__() + + if not torch.cuda.is_available(): + self.module = module + self.device_ids = [] + return + + if device_ids is None: + device_ids = list(range(torch.cuda.device_count())) + if output_device is None: + output_device = device_ids[0] + self.dim = dim + self.module = module + self.device_ids = device_ids + self.chunk_sizes = chunk_sizes + self.output_device = output_device + if len(self.device_ids) == 1: + self.module.cuda(device_ids[0]) + + def forward(self, *inputs, **kwargs): + if not self.device_ids: + return self.module(*inputs, **kwargs) + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids, self.chunk_sizes) + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) + outputs = self.parallel_apply(replicas, inputs, kwargs) + return self.gather(outputs, self.output_device) + + def replicate(self, module, device_ids): + return replicate(module, device_ids) + + def scatter(self, inputs, kwargs, device_ids, chunk_sizes): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim, chunk_sizes=self.chunk_sizes) + + def parallel_apply(self, replicas, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) + + def gather(self, outputs, output_device): + return gather(outputs, output_device, dim=self.dim) + + +def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None): + r"""Evaluates module(input) in parallel across the GPUs given in device_ids. + + This is the functional version of the DataParallel module. + + Args: + module: the module to evaluate in parallel + inputs: inputs to the module + device_ids: GPU ids on which to replicate module + output_device: GPU location of the output Use -1 to indicate the CPU. 
+ (default: device_ids[0]) + Returns: + a Variable containing the result of module(input) located on + output_device + """ + if not isinstance(inputs, tuple): + inputs = (inputs,) + + if device_ids is None: + device_ids = list(range(torch.cuda.device_count())) + + if output_device is None: + output_device = device_ids[0] + + inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim) + if len(device_ids) == 1: + return module(*inputs[0], **module_kwargs[0]) + used_device_ids = device_ids[:len(inputs)] + replicas = replicate(module, used_device_ids) + outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids) + return gather(outputs, output_device, dim) + +def DataParallel(module, device_ids=None, output_device=None, dim=0, chunk_sizes=None): + if chunk_sizes is None: + return torch.nn.DataParallel(module, device_ids, output_device, dim) + standard_size = True + for i in range(1, len(chunk_sizes)): + if chunk_sizes[i] != chunk_sizes[0]: + standard_size = False + if standard_size: + return torch.nn.DataParallel(module, device_ids, output_device, dim) + return _DataParallel(module, device_ids, output_device, dim, chunk_sizes) \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/decode.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/decode.py new file mode 100644 index 00000000..e07da2fe --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/decode.py @@ -0,0 +1,383 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +from .utils import _gather_feat, _tranpose_and_gather_feat, _get_4ps_feat +import numpy as np +import shapely +import time +from shapely.geometry import Polygon, MultiPoint, Point + +def _nms(heat, name, kernel=3): + pad = (kernel - 1) // 2 + + hmax = nn.functional.max_pool2d( + heat, (kernel, kernel), stride=1, padding=pad) + #save_map(hmax.cpu().numpy()[0],name) + keep = (hmax == heat).float() + return heat * keep,keep + +def _topk_channel(scores, K=40): + + batch, cat, height, width = scores.size() + + topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) + + topk_inds = topk_inds % (height * width) + topk_ys = (topk_inds / width).int().float() + topk_xs = (topk_inds % width).int().float() + + return topk_scores, topk_inds, topk_ys, topk_xs + +def _topk(scores, K=40, device=None): + #import ipdb + #ipdb.set_trace() + batch, cat, height, width = scores.size() + + topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) + + topk_inds = topk_inds % (torch.Tensor([height]).to(torch.int64).to(device) * torch.Tensor([width]).to(torch.int64).to(device)) + topk_ys = (topk_inds / torch.Tensor([width]).to(device)).int().float() + topk_xs = (topk_inds % torch.Tensor([width]).to(torch.int64).to(device)).int().float() + + topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) + topk_clses = (topk_ind // K).int() + topk_inds = _gather_feat( + topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) + topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) + topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) + + return topk_score, topk_inds, topk_clses, topk_ys, topk_xs + +def corner_decode(mk, st_reg, mk_reg=None, K=400,device=None): + batch, cat, height, width = mk.size() + mk,keep = _nms(mk,'mk.0.maxpool') + scores, inds, clses, ys, 
xs = _topk(mk, K=K, device=device) + if mk_reg is not None: + reg = _tranpose_and_gather_feat(mk_reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + scores = scores.view(batch, K, 1) + st_Reg = _tranpose_and_gather_feat(st_reg, inds) + bboxes = torch.cat([xs - st_Reg[..., 0:1], + ys - st_Reg[..., 1:2], + xs - st_Reg[..., 2:3], + ys - st_Reg[..., 3:4], + xs - st_Reg[..., 4:5], + ys - st_Reg[..., 5:6], + xs - st_Reg[..., 6:7], + ys - st_Reg[..., 7:8]], dim=2) + corner_dict = {'scores': scores, 'inds': inds, 'ys': ys, 'xs': xs, 'gboxes': bboxes} + return scores, inds, ys, xs, bboxes, corner_dict + +def ctdet_4ps_decode(heat, wh, ax, cr, corner_dict=None, reg=None, cat_spec_wh=False, K=100, wiz_rev = False): + + # if wiz_rev : + # print('Grouping and Parsing ...') + batch, cat, height, width = heat.size() + device = heat.device + # heat = torch.sigmoid(heat) + # perform nms on heatmaps + heat,keep = _nms(heat,'hm.0.maxpool') + + scores, inds, clses, ys, xs = _topk(heat, K=K,device=device) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + wh = _tranpose_and_gather_feat(wh, inds) + ax = _tranpose_and_gather_feat(ax, inds) + + if cat_spec_wh: + wh = wh.view(batch, K, cat, 8) + clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 8).long() + wh = wh.gather(2, clses_ind).view(batch, K, 8) + else: + wh = wh.view(batch, K, 8) + clses = clses.view(batch, K, 1).float() + scores = scores.view(batch, K, 1) + + ''' + bboxes = torch.cat([xs - wh[..., 0:1], + ys - wh[..., 1:2], + xs + wh[..., 2:3], + ys - wh[..., 3:4], + xs + wh[..., 4:5], + ys + wh[..., 5:6], + xs - wh[..., 6:7], + ys + wh[..., 7:8]], dim=2) + ''' + + bboxes = torch.cat([xs - wh[..., 0:1], + ys - wh[..., 1:2], + xs - wh[..., 2:3], + ys - wh[..., 3:4], + xs - wh[..., 4:5], + ys - wh[..., 5:6], + xs - wh[..., 6:7], + ys - wh[..., 7:8]], dim=2) + + rev_time_s1 = time.time() + if wiz_rev : + bboxes_rev = bboxes.clone() + bboxes_cpu = bboxes.clone().cpu() + + gboxes = corner_dict['gboxes'] + gboxes_cpu = gboxes.cpu() + + num_bboxes = bboxes.shape[1] + num_gboxes = gboxes.shape[1] + + corner_xs = corner_dict['xs'] + corner_ys = corner_dict['ys'] + corner_scores = corner_dict['scores'] + + + for i in range(num_bboxes): + if scores[0,i,0] >= 0.2 : + count = 0 # counting the number of ends of st head in bbox i + for j in range(num_gboxes): + if corner_scores[0,j,0] >= 0.3: + #here comes to one pair of valid bbox and gbox + #step1 is there an overlap + + bbox = bboxes_cpu[0,i,:] + gbox = gboxes_cpu[0,j,:] + #rev_time_s3 = time.time() + if is_group_faster_faster(bbox, gbox): + #step2 find which corner point to refine, and do refine + cr_x = corner_xs[0,j,0] + cr_y = corner_ys[0,j,0] + + ind4ps = find4ps(bbox, cr_x, cr_y, device) + if bboxes_rev[0, i, 2*ind4ps] == bboxes[0, i, 2*ind4ps] and bboxes_rev[0, i, 2*ind4ps+1] == bboxes[0, i, 2*ind4ps+1]: + #first_shift + count = count + 1 + bboxes_rev[0, i, 2*ind4ps] = cr_x + bboxes_rev[0, i, 2*ind4ps + 1] = cr_y + else: + origin_x = bboxes[0, i, 2*ind4ps] + origin_y = bboxes[0, i, 2*ind4ps+1] + + old_x = bboxes_rev[0, i, 2*ind4ps] + old_y = bboxes_rev[0, i, 2*ind4ps+1] + + if dist(origin_x, origin_y, old_x, old_y) >= dist(origin_x, 
origin_y, cr_x, cr_y): + count = count + 1 + bboxes_rev[0, i, 2*ind4ps] = cr_x + bboxes_rev[0, i, 2*ind4ps + 1] = cr_y + else: + continue + else: + + continue + else: + break + if count <= 2: + scores[0,i,0] = scores[0,i,0] * 0.4 + else : + break + + if wiz_rev: + + cc_match = torch.cat([(bboxes_rev[:,:,0:1]) + width * torch.round(bboxes_rev[:,:,1:2]), + (bboxes_rev[:,:,2:3]) + width * torch.round(bboxes_rev[:,:,3:4]), + (bboxes_rev[:,:,4:5]) + width * torch.round(bboxes_rev[:,:,5:6]), + (bboxes_rev[:,:,6:7]) + width * torch.round(bboxes_rev[:,:,7:8])], dim=2) + + else: + cc_match = torch.cat([(xs - wh[..., 0:1]) + width * torch.round(ys - wh[..., 1:2]), + (xs - wh[..., 2:3]) + width * torch.round(ys - wh[..., 3:4]), + (xs - wh[..., 4:5]) + width * torch.round(ys - wh[..., 5:6]), + (xs - wh[..., 6:7]) + width * torch.round(ys - wh[..., 7:8])], dim=2) + + cc_match = torch.round(cc_match).to(torch.int64) + + cr_feat = _get_4ps_feat(cc_match, cr) + cr_feat = cr_feat.sum(axis = 3) + if wiz_rev: + detections = torch.cat([bboxes_rev, scores, clses], dim=2) + _, sorted_ind = torch.sort(scores, descending=True, dim=1) + sorted_inds = sorted_ind.expand(detections.size(0), detections.size(1), detections.size(2)) + detections = detections.gather(1, sorted_inds) + sorted_inds2 = sorted_ind.expand(detections.size(0), detections.size(1), ax.size(2)) + ax = ax.gather(1, sorted_inds2) + else: + + detections = torch.cat([bboxes, scores, clses], dim=2) + + return detections, keep, ax, cr_feat + +def wireless_decode(heat, wh, ax, cr, reg=None, cat_spec_wh=False, K=100): + + batch, cat, height, width = heat.size() + # heat = torch.sigmoid(heat) + # perform nms on heatmaps + heat,keep = _nms(heat,'hm.0.maxpool') + + scores, inds, clses, ys, xs = _topk(heat, K=K) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + wh = _tranpose_and_gather_feat(wh, inds) + ax = _tranpose_and_gather_feat(ax, inds) + + if cat_spec_wh: + wh = wh.view(batch, K, cat, 8) + clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 8).long() + wh = wh.gather(2, clses_ind).view(batch, K, 8) + else: + wh = wh.view(batch, K, 8) + clses = clses.view(batch, K, 1).float() + scores = scores.view(batch, K, 1) + ''' + bboxes = torch.cat([xs - wh[..., 0:1], + ys - wh[..., 1:2], + xs + wh[..., 2:3], + ys - wh[..., 3:4], + xs + wh[..., 4:5], + ys + wh[..., 5:6], + xs - wh[..., 6:7], + ys + wh[..., 7:8]], dim=2) + ''' + bboxes = torch.cat([xs - wh[..., 0:1], + ys - wh[..., 1:2], + xs - wh[..., 2:3], + ys - wh[..., 3:4], + xs - wh[..., 4:5], + ys - wh[..., 5:6], + xs - wh[..., 6:7], + ys - wh[..., 7:8]], dim=2) + + cc_match = torch.cat([(xs - wh[..., 0:1]) + width * torch.round(ys - wh[..., 1:2]), + (xs - wh[..., 2:3]) + width * torch.round(ys - wh[..., 3:4]), + (xs - wh[..., 4:5]) + width * torch.round(ys - wh[..., 5:6]), + (xs - wh[..., 6:7]) + width * torch.round(ys - wh[..., 7:8])], dim=2) + + cc_match = torch.round(cc_match).to(torch.int64) + + cr_feat = _get_4ps_feat(cc_match, cr) + cr_feat = cr_feat.sum(axis = 3) + + detections = torch.cat([bboxes, scores, clses], dim=2) + return detections, keep, ax, cr_feat + +def find4ps(bbox, x, y,device): + xs = torch.Tensor([bbox[0],bbox[2],bbox[4],bbox[6]]).to(device) + ys = torch.Tensor([bbox[1],bbox[3],bbox[5],bbox[7]]).to(device) + + dx = xs - x + dy = ys - y + + l = dx**2 + dy**2 + return 
torch.argmin(l) + +def dist(x1, y1, x2, y2): + dx = x1 - x2 + dy = y1 - y2 + l = dx**2 + dy**2 + return l + +def rect_inter(b1_x1, b1_y1, b1_x2, b1_y2, b2_x1, b2_y1, b2_x2, b2_y2): + if (b1_x1 <= b2_x1 and b2_x1 <= b1_x2) or (b1_x1 <= b2_x2 and b2_x2 <= b1_x2): + if (b1_y1 <= b2_y1 and b2_y1 <= b1_y2) or (b1_y1 <= b2_y2 and b2_y2 <= b1_y2): + return True + else: + return False + else: + return False + +def is_group_faster_faster(bbox, gbox): + bbox = bbox.view(4,2) + gbox = gbox.view(4,2) + + bbox_xmin, bbox_xmax, bbox_ymin, bbox_ymax = bbox[:,0].min(), bbox[:,0].max(), bbox[:,1].min(), bbox[:,1].max()#min(bbox_xs), max(bbox_xs), min(bbox_ys), max(bbox_ys) + gbox_xmin, gbox_xmax, gbox_ymin, gbox_ymax = gbox[:,0].min(), gbox[:,0].max(), gbox[:,1].min(), gbox[:,1].max() + + if bbox_xmin > gbox_xmax or gbox_xmin > bbox_xmax or bbox_ymin > gbox_ymax or gbox_ymin > bbox_ymax: + return False + else: + bpoly = Polygon(bbox) + + flag = 0 + for i in range(4): + p = Point(gbox[i]) + if p.within(bpoly): + flag = 1 + break + if flag == 0: + return False + else : + return True + +def ctdet_st_decode(heat, st, reg=None, cat_spec_wh=False, K=100): + batch, cat, height, width = heat.size() + heat,keep = _nms(heat,'hm.0.maxpool') + + scores, inds, clses, ys, xs = _topk(heat, K=K) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + st = _tranpose_and_gather_feat(st, inds) + if cat_spec_wh: + st = st.view(batch, K, cat, 4) + clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 4).long() + st = st.gather(2, clses_ind).view(batch, K, 4) + else: + st = st.view(batch, K, 4) + + return st + +def ctdet_decode(heat, wh, reg=None, cat_spec_wh=False, K=100): + batch, cat, height, width = heat.size() + + # heat = torch.sigmoid(heat) + # perform nms on heatmaps + heat = _nms(heat) + + scores, inds, clses, ys, xs = _topk(heat, K=K) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + wh = _tranpose_and_gather_feat(wh, inds) + if cat_spec_wh: + wh = wh.view(batch, K, cat, 2) + clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 2).long() + wh = wh.gather(2, clses_ind).view(batch, K, 2) + else: + wh = wh.view(batch, K, 2) + clses = clses.view(batch, K, 1).float() + scores = scores.view(batch, K, 1) + bboxes = torch.cat([xs - wh[..., 0:1] / 2, + ys - wh[..., 1:2] / 2, + xs + wh[..., 0:1] / 2, + ys + wh[..., 1:2] / 2], dim=2) + detections = torch.cat([bboxes, scores, clses], dim=2) + + return detections + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/losses.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/losses.py new file mode 100644 index 00000000..ca0e35c5 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/losses.py @@ -0,0 +1,245 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +from .utils import _flatten_and_gather_feat, _tranpose_and_gather_feat +import torch.nn.functional as F + +def _neg_loss(pred, gt): + ''' Modified focal loss. 
Exactly the same as CornerNet. + Runs faster and costs a little bit more memory + Arguments: + pred (batch x c x h x w) + gt_regr (batch x c x h x w) + ''' + pos_inds = gt.eq(1).float() + neg_inds = gt.lt(1).float() + + neg_weights = torch.pow(1 - gt, 4) + + loss = 0 + + pos_loss = torch.log(pred) * torch.pow(1 - pred, 2) * pos_inds + neg_loss = torch.log(1 - pred) * torch.pow(pred, 2) * neg_weights * neg_inds + + num_pos = pos_inds.float().sum() + pos_loss = pos_loss.sum() + neg_loss = neg_loss.sum() + + if num_pos == 0: + loss = loss - neg_loss + else: + loss = loss - (pos_loss + neg_loss) / num_pos + return loss + +def _reg_loss(regr, gt_regr, mask): + ''' L1 regression loss + Arguments: + regr (batch x max_objects x dim) + gt_regr (batch x max_objects x dim) + mask (batch x max_objects) + ''' + num = mask.float().sum() + mask = mask.unsqueeze(2).expand_as(gt_regr).float() + + regr = regr * mask + gt_regr = gt_regr * mask + + regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, size_average=False) + regr_loss = regr_loss / (num + 1e-4) + return regr_loss + +class AxisLoss(nn.Module): + def __init__(self): + super(AxisLoss, self).__init__() + + def forward(self, output, mask, ind, target, logi=None): + span_type = False + #computing vanilla axis loss + if logi is None: + pred = _tranpose_and_gather_feat(output, ind) + else: + pred = logi + + mask = mask.unsqueeze(2).float() + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (4*(mask.sum() + 1e-4)) + + return loss + +class FocalLoss(nn.Module): + '''nn.Module warpper for focal loss''' + def __init__(self): + super(FocalLoss, self).__init__() + self.neg_loss = _neg_loss + + def forward(self, out, target): + return self.neg_loss(out, target) + +class RegLoss(nn.Module): + '''Regression loss for an output tensor + Arguments: + output (batch x dim x h x w) + mask (batch x max_objects) + ind (batch x max_objects) + target (batch x max_objects x dim) + ''' + def __init__(self): + super(RegLoss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + loss = _reg_loss(pred, target, mask) + return loss + +class RegL1Loss(nn.Module): + def __init__(self): + super(RegL1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.unsqueeze(2).expand_as(pred).float() + # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (mask.sum() + 1e-4) + return loss + +class PairLoss(nn.Module): + def __init__(self): + super(PairLoss, self).__init__() + + def forward(self, output1, ind1, output2, ind2, mask, mask_cro, ctr_cro_ind, target1, target2, hm_ctxy): + + pred1 = _tranpose_and_gather_feat(output1, ind1) #bxmx8 + pred2 = _tranpose_and_gather_feat(output2, ind2) #bxnx8 + pred2_tmp = pred2 + target2_tmp = target2 + mask = mask.unsqueeze(2).expand_as(pred1).float() + + b = pred1.size(0) + m = pred1.size(1) + n = pred2.size(1) + pred2 = pred2.view(b,4*n,2) + ctr_cro_ind = ctr_cro_ind.unsqueeze(2).expand(b,4*m,2) + pred2 = pred2.gather(1,ctr_cro_ind).view(b,m,8) #bxmx8 + target2 = target2.view(b,4*n,2).gather(1,ctr_cro_ind).view(b,m,8) + + delta = (torch.abs(pred1-target1)+torch.abs(pred2-target2)) / (torch.abs(target1) + 1e-4) + delta = delta * delta + delta_mask = (~delta.gt(1.0))*1#1 - delta.gt(1.0) + delta = delta*(delta_mask.float())+(1-delta_mask).float() + delta = (-3.14)*delta + 
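+ # delta above is the squared relative error of the paired predictions, clamped to 1;
+ # weight = 1 - exp(-3.14 * delta) rises from 0 (perfect pair) towards ~0.96 (relative error >= 100%),
+ # so the weighted L1 terms below concentrate on poorly matched pairs.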
weight = 1 - torch.exp(delta) + + loss1 = F.l1_loss(pred1 * mask * weight, target1 * mask * weight, size_average=False) + loss2 = F.l1_loss(pred2 * mask * weight, target2 * mask * weight, size_average=False) + loss1 = loss1 / (mask.sum() + 1e-4) + loss2 = loss2 / (mask.sum() + 1e-4) + + mask1 = (target2_tmp==0) + mask_cro = mask_cro.unsqueeze(2).expand(b,n,8) + MASK = (mask1==mask_cro).float() + loss3 = F.l1_loss(pred2_tmp * MASK, target2_tmp * MASK, size_average=False) + loss3 = loss3 / (mask.sum() + 1e-4) + + return loss1, 0.5 * loss2 + 0.2 * loss3 + +class NormRegL1Loss(nn.Module): + def __init__(self): + super(NormRegL1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.unsqueeze(2).expand_as(pred).float() + # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + pred = pred / (target + 1e-4) + target = target * 0 + 1 + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (mask.sum() + 1e-4) + return loss + +class RegWeightedL1Loss(nn.Module): + def __init__(self): + super(RegWeightedL1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.float() + # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (mask.sum() + 1e-4) + return loss + +class L1Loss(nn.Module): + def __init__(self): + super(L1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.unsqueeze(2).expand_as(pred).float() + loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + return loss + + +def compute_res_loss(output, target): + return F.smooth_l1_loss(output, target, reduction='elementwise_mean') + + +def _axis_eval(output, mask, ind, target, logi=None, mode = None): + if logi is None: + pred = _tranpose_and_gather_feat(output, ind) + else: + pred = logi + + dev = pred - target + dev = torch.abs(dev) + + one = torch.ones(dev.shape).cuda() + zero = torch.zeros(dev.shape).cuda() + + true_vec = torch.where(dev < 0.5, one, zero) + #true = torch.sum((true_vec * mask.unsqueeze(2)) == 1) + true = torch.sum(true_vec.sum(axis=2)*mask == 4) + total = (mask == 1).sum() + true = true.to(torch.float32) + total = total.to(torch.float32) + + acc = true/total + + if acc is None: + acc = 0 + + if mode == 'full': + pred_int = process_logi(pred) + + pred_pair = _make_pair_feat(pred_int) + target_pair = _make_pair_feat(target) + mask_pair = _make_pair_feat(ind) + + #pred_int = pred_int.expand() + + mask_1 = mask_pair[:,:,:,0] + mask_2 = mask_pair[:,:,:,1] + + at_vec_h = (target_pair[:,:,:,2]<=target_pair[:,:,:,6]) & (target_pair[:,:,:,6]<=target_pair[:,:,:,3]) & (mask_1 != 0) & (mask_2 != 0) + at_vec_w = (target_pair[:,:,:,0]<=target_pair[:,:,:,4]) & (target_pair[:,:,:,4]<=target_pair[:,:,:,1]) & (mask_1 != 0) & (mask_2 != 0) + + ap_vec_h = (pred_pair[:,:,:,2]<=pred_pair[:,:,:,6]) & (pred_pair[:,:,:,6]<=pred_pair[:,:,:,3]) & (mask_1 != 0) & (mask_2 != 0) + ap_vec_w = (pred_pair[:,:,:,0]<=pred_pair[:,:,:,4]) & (pred_pair[:,:,:,4]<=pred_pair[:,:,:,1]) & (mask_1 != 0) & (mask_2 != 0) + + tp_h = at_vec_h & ap_vec_h + tp_w = at_vec_w & ap_vec_w + + ap = torch.sum(ap_vec_h) + torch.sum(ap_vec_w) + at = torch.sum(at_vec_h) + torch.sum(at_vec_w) + tp = torch.sum(tp_h) + torch.sum(tp_w) + + pre = tp/(ap + 1e-4) + rec = tp/(at + 
1e-4) + + return acc , pre, rec + else: + return acc + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/model.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/model.py new file mode 100644 index 00000000..dfef0dd1 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/model.py @@ -0,0 +1,151 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torchvision.models as models +import torch +import torch.nn as nn +import os + +from .networks.fpn_resnet import get_pose_net_fpn +from .networks.fpn_resnet_half import get_pose_net_fpn_half + +from .networks.fpn_mask_resnet import get_pose_net_fpn_mask +from .networks.fpn_mask_resnet_half import get_pose_net_fpn_mask_half + +from .networks.pose_dla_dcn import get_pose_net as get_dla_dcn + + +_model_factory = { + 'dla': get_dla_dcn, + 'resfpn':get_pose_net_fpn, + 'resfpnhalf': get_pose_net_fpn_half, + 'resfpnmask':get_pose_net_fpn_mask, + 'resfpnmaskhalf':get_pose_net_fpn_mask_half +} + +def create_model(arch, heads, head_conv): + num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0 + arch = arch[:arch.find('_')] if '_' in arch else arch + get_model = _model_factory[arch] + model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv) + return model + +def load_model(model, model_path, optimizer=None, resume=False, + lr=None, lr_step=None): + start_epoch = 0 + + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) + + print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch'])) + state_dict_ = checkpoint['state_dict'] + state_dict = {} + + # convert data_parallal to model + for k in state_dict_: + if k.startswith('module') and not k.startswith('module_list'): + state_dict[k[7:]] = state_dict_[k] + else: + state_dict[k] = state_dict_[k] + print(model, file=open('model.txt', 'a')) + model_state_dict = model.state_dict() + + # check loaded parameters and created model parameters + for k in state_dict: + if k in model_state_dict: + if state_dict[k].shape != model_state_dict[k].shape: + print('Skip loading parameter {}, required shape{}, '\ + 'loaded shape{}.'.format( + k, model_state_dict[k].shape, state_dict[k].shape)) + state_dict[k] = model_state_dict[k] + else: + print('Drop parameter {}.'.format(k)) + for k in model_state_dict: + if not (k in state_dict): + print('No param {}.'.format(k)) + state_dict[k] = model_state_dict[k] + model.load_state_dict(state_dict, strict=False) + + # resume optimizer parameters + if optimizer is not None and resume: + if 'optimizer' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + start_epoch = checkpoint['epoch'] + start_lr = lr + for step in lr_step: + if start_epoch >= step: + start_lr *= 0.1 + for param_group in optimizer.param_groups: + param_group['lr'] = start_lr + print('Resumed optimizer with start lr', start_lr) + else: + print('No optimizer parameters in checkpoint.') + if optimizer is not None: + return model, optimizer, start_epoch + else: + return model + +def load_multiple(model, model_path, optimizer=None, resume=False, + lr=None, lr_step=None): + start_epoch = 0 + + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) + + print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch'])) + state_dict_ = checkpoint['state_dict'] + state_dict = {} + + # convert data_parallal to model + for k in state_dict_: 
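+ # checkpoints saved from an nn.DataParallel model prefix every key with 'module.';
+ # strip that prefix so the weights load into the bare (non-parallel) model.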
+ if k.startswith('module') and not k.startswith('module_list'): + state_dict[k[7:]] = state_dict_[k] + else: + state_dict[k] = state_dict_[k] + model_state_dict = model.state_dict() + + # check loaded parameters and created model parameters + for k in state_dict: + if k in model_state_dict: + if state_dict[k].shape != model_state_dict[k].shape: + print('Skip loading parameter {}, required shape{}, '\ + 'loaded shape{}.'.format( + k, model_state_dict[k].shape, state_dict[k].shape)) + state_dict[k] = model_state_dict[k] + else: + print('Drop parameter {}.'.format(k)) + for k in model_state_dict: + if not (k in state_dict): + print('No param {}.'.format(k)) + state_dict[k] = model_state_dict[k] + model.load_state_dict(state_dict, strict=False) + + # resume optimizer parameters + if optimizer is not None and resume: + if 'optimizer' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + start_epoch = checkpoint['epoch'] + start_lr = lr + for step in lr_step: + if start_epoch >= step: + start_lr *= 0.1 + for param_group in optimizer.param_groups: + param_group['lr'] = start_lr + print('Resumed optimizer with start lr', start_lr) + else: + print('No optimizer parameters in checkpoint.') + if optimizer is not None: + return model, optimizer, start_epoch + else: + return model + +def save_model(path, epoch, model, optimizer=None): + if isinstance(model, torch.nn.DataParallel): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + data = {'epoch': epoch, + 'state_dict': state_dict} + if not (optimizer is None): + data['optimizer'] = optimizer.state_dict() + torch.save(data, path) + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/PKG-INFO b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/PKG-INFO new file mode 100644 index 00000000..0e4d91d8 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/PKG-INFO @@ -0,0 +1,7 @@ +Metadata-Version: 2.1 +Name: DCNv2 +Version: 0.1 +Summary: deformable convolutional networks +Home-page: https://github.com/charlesshang/DCNv2 +Author: charlesshang +License-File: LICENSE diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/SOURCES.txt b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/SOURCES.txt new file mode 100644 index 00000000..f4eeb2b5 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/SOURCES.txt @@ -0,0 +1,13 @@ +LICENSE +setup.py +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.cpp +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu 
+/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu +/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu +DCNv2.egg-info/PKG-INFO +DCNv2.egg-info/SOURCES.txt +DCNv2.egg-info/dependency_links.txt +DCNv2.egg-info/top_level.txt \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/dependency_links.txt b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/dependency_links.txt new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/top_level.txt b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/top_level.txt new file mode 100644 index 00000000..8eafb8a8 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/DCNv2.egg-info/top_level.txt @@ -0,0 +1 @@ +_ext diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/LICENSE b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/LICENSE new file mode 100644 index 00000000..b2e3b520 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2019, Charles Shang +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/_ext.cpython-37m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/_ext.cpython-37m-x86_64-linux-gnu.so new file mode 100755 index 00000000..33ff3ea9 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/_ext.cpython-37m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/lib.linux-x86_64-cpython-37/_ext.cpython-37m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/lib.linux-x86_64-cpython-37/_ext.cpython-37m-x86_64-linux-gnu.so new file mode 100755 index 00000000..33ff3ea9 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/lib.linux-x86_64-cpython-37/_ext.cpython-37m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/build.ninja new file mode 100644 index 00000000..ce664703 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/build.ninja @@ -0,0 +1,34 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda/bin/nvcc + +cflags = -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /ssd8/exec/huangjy/miniconda3/envs/vgt/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include -fPIC -O2 -isystem /ssd8/exec/huangjy/miniconda3/envs/vgt/include -fPIC -DWITH_CUDA -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include/python3.9 -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 +cuda_cflags = -DWITH_CUDA -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include 
-I/ssd8/exec/huangjy/miniconda3/envs/vgt/include/python3.9 -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -ccbin g++ -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + depfile = $out.d + deps = gcc + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu +build 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-3.9/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.cpp + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_deps b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_deps new file mode 100644 index 00000000..d4d080be Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_deps differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_log b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_log new file mode 100644 index 00000000..45c44099 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/.ninja_log @@ -0,0 +1,57 @@ +# ninja log v5 +1 2313 1710137265966278799 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o 22833fbca9713fe0 +1 2816 1710137266468282295 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 2667d4baf8b5407c +1 3128 1710137266780284468 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o e8d861483d5a7377 +2 7548 1710137271198315232 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o e1e4b0044419bc08 +2 7644 1710137271294315901 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o da48a83253f181bc +2 7759 1710137271410316708 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 35b5b9ec39df087 +2 14817 1710137278463365822 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o 1cbfc7ac6fc9c0f0 +22 2467 1710137281370386064 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b7c1efd29df8f47f +22 2840 1710137281742388655 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 5ada4b2f1d493b12 +22 2986 1710137281888389671 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o a2182f7e45c0b54 +22 6892 1710137285793416602 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 13c07617accbab3a +22 7101 1710137286002418042 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o c2b91aa2571f2a +22 7212 1710137286113418807 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o 985c1e87cefa513c +22 13054 1710137291950459029 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o ba9527b6206ee297 +17 2206 1710224953594403051 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o 22833fbca9713fe0 +17 2933 1710224954320407881 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 2667d4baf8b5407c +17 3048 1710224954435408646 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o e8d861483d5a7377 +18 6827 1710224958213433783 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o e1e4b0044419bc08 +18 7010 1710224958395434994 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 35b5b9ec39df087 +18 7194 1710224958580436225 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o da48a83253f181bc +18 13336 1710224964718477065 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o 1cbfc7ac6fc9c0f0 +16 2403 1710224967513495661 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b7c1efd29df8f47f +16 2656 1710224967765497338 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 5ada4b2f1d493b12 +16 3000 1710224968108499620 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o a2182f7e45c0b54 +17 6981 1710224972089526108 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o c2b91aa2571f2a +17 7089 1710224972196526820 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 13c07617accbab3a +16 7161 1710224972267527292 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o 985c1e87cefa513c +17 13132 1710224978234567010 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o ba9527b6206ee297 +36 2507 1710230192773252741 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o 22833fbca9713fe0 +36 3054 1710230193318256641 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 2667d4baf8b5407c +36 3315 1710230193580258516 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o e8d861483d5a7377 +36 7396 1710230197659287706 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 35b5b9ec39df087 +36 7458 1710230197721288149 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o e1e4b0044419bc08 +36 7904 1710230198168291348 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o da48a83253f181bc +36 14342 1710230204601337383 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o 1cbfc7ac6fc9c0f0 +23 2189 1710230207208356039 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b7c1efd29df8f47f +23 2683 1710230207700359560 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 5ada4b2f1d493b12 +23 3008 1710230208025361886 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o a2182f7e45c0b54 +23 6932 1710230211947389952 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o c2b91aa2571f2a +23 6960 1710230211977390167 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 13c07617accbab3a +23 7106 1710230212122391204 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o 985c1e87cefa513c +24 14589 1710230219600444723 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o ba9527b6206ee297 +41 2415 1710474074089677356 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o 22833fbca9713fe0 +41 2915 1710474074588681031 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 2667d4baf8b5407c +41 3266 1710474074939683616 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o e8d861483d5a7377 +42 7080 1710474078751711686 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o e1e4b0044419bc08 +42 7266 1710474078937713056 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 35b5b9ec39df087 +42 7358 1710474079029713733 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o da48a83253f181bc +42 13563 1710474085230759396 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o 1cbfc7ac6fc9c0f0 +21 2154 1710474087804778350 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b7c1efd29df8f47f +21 2945 1710474088594784168 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o 5ada4b2f1d493b12 +21 3037 1710474088686784845 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o a2182f7e45c0b54 +21 6778 1710474092426812386 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o c2b91aa2571f2a +21 7076 1710474092723814573 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o 13c07617accbab3a +21 7171 1710474092818815272 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o 985c1e87cefa513c +21 13186 1710474098828859528 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o ba9527b6206ee297 diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/build.ninja new file mode 100644 index 00000000..38c79261 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/build.ninja @@ -0,0 +1,32 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda/bin/nvcc + +cflags = -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -DWITH_CUDA -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 +cuda_cflags = -DWITH_CUDA -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src 
-I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_75,code=sm_75 -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu +build 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.cpp + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o new file mode 100644 index 00000000..1f8ea657 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o new file mode 100644 index 00000000..cf177c40 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o new file mode 100644 index 00000000..a3a70b44 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.o differ diff --git 
a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o new file mode 100644 index 00000000..de014f21 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o new file mode 100644 index 00000000..f8b74ce4 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o new file mode 100644 index 00000000..052038c4 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o new file mode 100644 index 00000000..fe667a45 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2/src/vision.o differ diff --git 
a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/dcn_v2.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/dcn_v2.py new file mode 100644 index 00000000..1b3ae6ec --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/dcn_v2.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python +from __future__ import absolute_import, division, print_function + +import math + +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +import _ext as _backend + + +class _DCNv2(Function): + @staticmethod + def forward( + ctx, input, offset, mask, weight, bias, stride, padding, dilation, deformable_groups + ): + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.kernel_size = _pair(weight.shape[2:4]) + ctx.deformable_groups = deformable_groups + output = _backend.dcn_v2_forward( + input, + weight, + bias, + offset, + mask, + ctx.kernel_size[0], + ctx.kernel_size[1], + ctx.stride[0], + ctx.stride[1], + ctx.padding[0], + ctx.padding[1], + ctx.dilation[0], + ctx.dilation[1], + ctx.deformable_groups, + ) + ctx.save_for_backward(input, offset, mask, weight, bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input, grad_offset, grad_mask, grad_weight, grad_bias = _backend.dcn_v2_backward( + input, + weight, + bias, + offset, + mask, + grad_output, + ctx.kernel_size[0], + ctx.kernel_size[1], + ctx.stride[0], + ctx.stride[1], + ctx.padding[0], + ctx.padding[1], + ctx.dilation[0], + ctx.dilation[1], + ctx.deformable_groups, + ) + + return grad_input, grad_offset, grad_mask, grad_weight, grad_bias, None, None, None, None + + @staticmethod + def symbolic( + g, input, offset, mask, weight, bias, stride, padding, dilation, deformable_groups + ): + from torch.nn.modules.utils import _pair + + stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + # as of trt 7, the dcn operation will be translated again by modifying the onnx file + # so the exporting code is kept to resemble the forward() + return g.op( + "ai.onnx.contrib::_DCNv2_2", + input, + offset, + mask, + weight, + bias, + stride_i=stride, + padding_i=padding, + dilation_i=dilation, + deformable_groups_i=deformable_groups, + ) + + +dcn_v2_conv = _DCNv2.apply + + +class DCNv2(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + deformable_groups=1, + ): + super(DCNv2, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size)) + self.bias = nn.Parameter(torch.Tensor(out_channels)) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1.0 / math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + self.bias.data.zero_() + + def forward(self, input, offset, mask): + assert ( + 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] + == offset.shape[1] + ) + assert self.deformable_groups * 
self.kernel_size[0] * self.kernel_size[1] == mask.shape[1] + return dcn_v2_conv( + input, + offset, + mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups, + ) + + +class DCN(DCNv2): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + deformable_groups=1, + ): + super(DCN, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, deformable_groups + ) + + channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] + self.conv_offset_mask = nn.Conv2d( + self.in_channels, + channels_, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True, + ) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, input): + out = self.conv_offset_mask(input) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return dcn_v2_conv( + input, + offset, + mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups, + ) + + +class _DCNv2Pooling(Function): + @staticmethod + def forward( + ctx, + input, + rois, + offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=0.0, + ): + ctx.spatial_scale = spatial_scale + ctx.no_trans = int(no_trans) + ctx.output_dim = output_dim + ctx.group_size = group_size + ctx.pooled_size = pooled_size + ctx.part_size = pooled_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + output, output_count = _backend.dcn_v2_psroi_pooling_forward( + input, + rois, + offset, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std, + ) + ctx.save_for_backward(input, rois, offset, output_count) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, offset, output_count = ctx.saved_tensors + grad_input, grad_offset = _backend.dcn_v2_psroi_pooling_backward( + grad_output, + input, + rois, + offset, + output_count, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std, + ) + + return grad_input, None, grad_offset, None, None, None, None, None, None, None, None + + +dcn_v2_pooling = _DCNv2Pooling.apply + + +class DCNv2Pooling(nn.Module): + def __init__( + self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=0.0, + ): + super(DCNv2Pooling, self).__init__() + self.spatial_scale = spatial_scale + self.pooled_size = pooled_size + self.output_dim = output_dim + self.no_trans = no_trans + self.group_size = group_size + self.part_size = pooled_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, input, rois, offset): + assert input.shape[1] == self.output_dim + if self.no_trans: + offset = input.new() + return dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) + + +class DCNPooling(DCNv2Pooling): + def __init__( + self, + spatial_scale, + 
pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=0.0, + deform_fc_dim=1024, + ): + super(DCNPooling, self).__init__( + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std, + ) + + self.deform_fc_dim = deform_fc_dim + + if not no_trans: + self.offset_mask_fc = nn.Sequential( + nn.Linear( + self.pooled_size * self.pooled_size * self.output_dim, self.deform_fc_dim + ), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.pooled_size * self.pooled_size * 3), + ) + self.offset_mask_fc[4].weight.data.zero_() + self.offset_mask_fc[4].bias.data.zero_() + + def forward(self, input, rois): + offset = input.new() + + if not self.no_trans: + + # do roi_align first + n = rois.shape[0] + roi = dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + True, # no trans + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) + + # build mask and offset + offset_mask = self.offset_mask_fc(roi.view(n, -1)) + offset_mask = offset_mask.view(n, 3, self.pooled_size, self.pooled_size) + o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + + # do pooling with offset and mask + return ( + dcn_v2_pooling( + input, + rois, + offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std, + ) + * mask + ) + # only roi_align + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/make.sh b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/make.sh new file mode 100755 index 00000000..55f5587e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/make.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +sudo rm *.so +sudo rm -r build/ +sudo python3 setup.py build develop diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/setup.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/setup.py new file mode 100644 index 00000000..ad9814a4 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/setup.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +import glob +import os + +import torch +from setuptools import find_packages, setup +from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension + +requirements = ["torch", "torchvision"] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + os.environ["CC"] = "g++" + sources = main_file + source_cpu + extension = CppExtension + extra_compile_args = {"cxx": []} + define_macros = [] + + + if torch.cuda.is_available() and CUDA_HOME is not None: + 
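        # Descriptive note (editorial): this branch selects the build flavor. When a CUDA
        # toolchain is visible (torch.cuda.is_available() and CUDA_HOME set), the .cu sources
        # are added and compiled as a CUDAExtension with the WITH_CUDA macro and the usual
        # __CUDA_NO_HALF_* / CUDA_HAS_FP16 guards passed to nvcc; otherwise the commented-out
        # NotImplementedError is skipped and the build quietly falls back to the CPU-only
        # CppExtension declared above.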
extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + # raise NotImplementedError('Cuda is not available') + pass + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + ext_modules = [ + extension( + "_ext", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + return ext_modules + + +setup( + name="DCNv2", + version="0.1", + author="charlesshang", + url="https://github.com/charlesshang/DCNv2", + description="deformable convolutional networks", + packages=find_packages(exclude=("configs", "tests")), + # install_requires=requirements, + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp new file mode 100644 index 00000000..76c65f01 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp @@ -0,0 +1,233 @@ +#include +#include "cpu/dcn_v2_im2col_cpu.h" + +#include +//#include + +#include +//#include +//#include + +//extern THCState *state; + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu +// modified from the CUDA version for CPU use by Daniel K. Suhendro + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * 
(kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + using scalar_t = float; + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto output_n = output.select(0, b); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + // (N x 1) (1 x M) + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, + ones.contiguous().data(), k_, + bias.contiguous().data(), k_, 0.0f, + output_n.data(), n_); + + modulated_deformable_im2col_cpu(input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data()); + + //(k * m) x (m * n) + // Y = WC + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THFloatBlas_gemm('n', 'n', n, m, k, 1.0f, + columns.data(), n, + weight.data(), k, 1.0f, + output_n.data(), n); + } + return output; +} + +std::vector dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto 
grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THFloatBlas_gemm('n', 't', n, m, k, 1.0f, + grad_output_n.data(), n, + weight.data(), m, 0.0f, + columns.data(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cpu(columns.data(), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data(), + grad_mask_n.data()); + // gradient w.r.t. input data + modulated_deformable_col2im_cpu(columns.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data()); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cpu(input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, + columns.data(), k_, + grad_output_n.data(), k_, 1.0f, + grad_weight.data(), n_); + + // gradient w.r.t. bias + // long m_ = channels_out; + // long k__ = height_out * width_out; + // THFloatBlas_gemv('t', k_, m_, 1.0f, + // grad_output_n.data(), k_, + // ones.data(), 1, 1.0f, + // grad_bias.data(), 1); + } + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp new file mode 100644 index 00000000..1704a60d --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.cpp @@ -0,0 +1,395 @@ +#include "dcn_v2_im2col_cpu.h" +#include +#include +#include + +#include +//#include + +#include +//#include +//#include + +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +/*#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + + +float dmcn_im2col_bilinear_cpu(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +float dmcn_get_gradient_weight_cpu(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +float dmcn_get_coordinate_weight_cpu(float argmax_h, float argmax_w, + const int height, const int width, const float *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) 
* im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +void modulated_deformable_im2col_cpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) +{ + // launch channels * batch_size * height_col * width_col cores + for(int index=0; index(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } +} + +void modulated_deformable_col2im_cpu_kernel(const int n, const float *data_col, const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + for(int index = 0; index < n; index++) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * 
dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + float weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + //atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad; + + } + } + } + } +} + +void modulated_deformable_col2im_coord_cpu_kernel(const int n, const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + for(int index = 0; index < n; index++) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = 
data_mask_ptr[data_mask_hw_ptr]; + float inv_h = h_in + i * dilation_h + offset_h; + float inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const float weight = dmcn_get_coordinate_weight_cpu( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cpu(const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_cpu_kernel( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + }*/ + +} + +void modulated_deformable_col2im_cpu(const float* data_col, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* grad_im){ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + modulated_deformable_col2im_cpu_kernel( + num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im); + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + }*/ + +} + +void modulated_deformable_col2im_coord_cpu(const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, 
const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + modulated_deformable_col2im_coord_cpu_kernel( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + }*/ +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.h new file mode 100644 index 00000000..bad5c528 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_im2col_cpu.h @@ -0,0 +1,99 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ +// modified from the CUDA version for CPU use by Daniel K. Suhendro + +#ifndef DCN_V2_IM2COL_CPU +#define DCN_V2_IM2COL_CPU + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cpu(const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cpu(const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cpu(const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp new file mode 100644 index 00000000..6e41aaed --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/dcn_v2_psroi_pooling_cpu.cpp @@ -0,0 +1,426 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +#include +#include +#include + +#include +//#include + +#include +//#include +//#include + +/*#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + +template +T bilinear_interp_cpu( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template + void DeformablePSROIPoolForwardKernelCpu( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + for(int index = 0; index < count; index++) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = std::min(std::max(w, T(0.)), width - T(1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp_cpu(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +void DeformablePSROIPoolBackwardAccKernelCpu( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + for(int index = 0; index < count; index++) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = std::min(std::max(w, T(0.)), width - T(1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + /*atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);*/ + *(offset_bottom_data_diff + bottom_index_base + y0 * width + x0) += q00 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y1 * width + x0) += q01 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y0 * width + x1) += q10 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y1 * width + x1) += q11 * diff_val; + + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + /*atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);*/ + *(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w) += diff_x; + *(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w) += diff_y; + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float 
spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + //cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512);*/ + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cpu_forward", [&] { + DeformablePSROIPoolForwardKernelCpu( + out_size, + input.contiguous().data(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data(), + top_count.data()); + }); + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + /*AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream();*/ + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cpu_backward", [&] { + DeformablePSROIPoolBackwardAccKernelCpu( + out_size, + out_grad.contiguous().data(), + top_count.contiguous().data(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data(), + trans_grad.contiguous().data(), + input.contiguous().data(), + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/vision.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/vision.h new file mode 100644 index 00000000..d5fbf1f0 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cpu/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu new file mode 100644 index 00000000..cef3068f --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu @@ -0,0 +1,372 @@ +#include +#include 
"cuda/dcn_v2_im2col_cuda.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +THCState *state = at::globalContext().lazyInitCUDA(); + +static cublasOperation_t _cublasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return CUBLAS_OP_N; + case 't': + case 'T': + return CUBLAS_OP_T; + case 'c': + case 'C': + return CUBLAS_OP_C; + } + AT_ERROR( + "_cublasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); + } + + static void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { + // Note: leading dimensions generally are checked that they are > 0 + // and at least as big the result requires (even if the value won't + // be used). + + // Q: Why does Level3 check trans but this doesn't? + // A: In level 2, the sizes (m, n) specify the size of A + // (independent of trans value). In level 3. the sizes (m, n, k) + // specify the sizes of op(A), op(B) where op depend on trans + // values. + if (n <= 1) + *lda = std::max(m, 1); + } + + + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +// [batch gemm] +// https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu + +__global__ void createBatchGemmBuffer(const float **input_b, float **output_b, + float **columns_b, const float **ones_b, + const float **weight_b, const float **bias_b, + float *input, float *output, + float *columns, float *ones, + float *weight, float *bias, + const int input_stride, const int output_stride, + const int columns_stride, const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + input_b[idx] = input + idx * input_stride; + output_b[idx] = output + idx * output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + bias_b[idx] = bias; + } +} + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + using scalar_t = float; + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, 
kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({batch, height_out, width_out}, input.options()); + auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + // prepare for batch-wise computing, which is significantly faster than instance-wise computing + // when batch size is large. + // launch batch threads + int matrices_size = batch * sizeof(float *); + auto input_b = static_cast(THCudaMalloc(state, matrices_size)); + auto output_b = static_cast(THCudaMalloc(state, matrices_size)); + auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); + auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); + auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); + auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); + + const int block = 128; + const int grid = (batch + block - 1) / block; + + createBatchGemmBuffer<<>>( + input_b, output_b, + columns_b, ones_b, + weight_b, bias_b, + input.data_ptr(), + output.data_ptr(), + columns.data_ptr(), + ones.data_ptr(), + weight.data_ptr(), + bias.data_ptr(), + channels * width * height, + channels_out * width_out * height_out, + channels * kernel_h * kernel_w * height_out * width_out, + height_out * width_out, + batch); + + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THCudaBlas_SgemmBatched(state, + 't', + 'n', + n_, + m_, + k_, + 1.0f, + ones_b, k_, + bias_b, k_, + 0.0f, + output_b, n_, + batch); + + modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), + input.data_ptr(), + offset.data_ptr(), + mask.data_ptr(), + batch, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data_ptr()); + + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THCudaBlas_SgemmBatched(state, + 'n', + 'n', + n, + m, + k, + 1.0f, + (const float **)columns_b, n, + weight_b, k, + 1.0f, + output_b, n, + batch); + + THCudaFree(state, input_b); + THCudaFree(state, output_b); + THCudaFree(state, columns_b); + THCudaFree(state, ones_b); + THCudaFree(state, weight_b); + THCudaFree(state, bias_b); + return output; +} + +__global__ void createBatchGemmBufferBackward( + float **grad_output_b, + float **columns_b, + float **ones_b, + float **weight_b, + float **grad_weight_b, + float **grad_bias_b, + float *grad_output, + float *columns, + float *ones, + float *weight, + float *grad_weight, + float *grad_bias, + const int grad_output_stride, + const int columns_stride, + const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + grad_output_b[idx] = grad_output + idx * grad_output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + grad_weight_b[idx] = grad_weight; + grad_bias_b[idx] = grad_bias; + } +} + +std::vector dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, 
+ const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, + grad_output_n.data_ptr(), n, + weight.data_ptr(), m, 0.0f, + columns.data_ptr(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda(c10::cuda::getCurrentCUDAStream(), + columns.data_ptr(), + input_n.data_ptr(), + offset_n.data_ptr(), + mask_n.data_ptr(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data_ptr(), + grad_mask_n.data_ptr()); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda(c10::cuda::getCurrentCUDAStream(), + columns.data_ptr(), + offset_n.data_ptr(), + mask_n.data_ptr(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data_ptr()); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cuda(c10::cuda::getCurrentCUDAStream(), + input_n.data_ptr(), + offset_n.data_ptr(), + mask_n.data_ptr(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data_ptr()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, + columns.data_ptr(), k_, + grad_output_n.data_ptr(), k_, 1.0f, + grad_weight.data_ptr(), n_); + + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + cublasOperation_t op = _cublasOpFromChar('t'); + _cublasAdjustLdLevel2(k_, m_, &k_); + scalar_t* grad_output_n_float = grad_output_n.data_ptr(); + scalar_t* one_float = ones.data_ptr(); + scalar_t alpha = 1.0; + scalar_t beta = 1.0; + cublasSgemv(handle, op, k_, m_, &alpha, grad_output_n_float,k_, one_float,1, &beta, grad_bias.data_ptr(), 1); + + } + + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu new file mode 100644 index 00000000..4140eacf --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu @@ -0,0 +1,402 @@ +#include "dcn_v2_im2col_cuda.h" +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + + +__device__ float dmcn_im2col_bilinear_cuda(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +__device__ float dmcn_get_gradient_weight_cuda(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == 
argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +__device__ float dmcn_get_coordinate_weight_cuda(float argmax_h, float argmax_w, + const int height, const int width, const float *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) +{ + // launch channels * batch_size * height_col * width_col cores + CUDA_KERNEL_LOOP(index, n) + { + // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) + // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis + + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + // const int b_col = (index / width_col / height_col) % batch_size; + const int b_col = (index / width_col / height_col / num_channels) % batch_size; + // const int c_im = (index / width_col / height_col) / batch_size; + const int c_im = (index / width_col / height_col) % num_channels; + // const int c_col = c_im * kernel_h * kernel_w; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + // float 
*data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float val = static_cast(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } +} + +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const float *data_col, const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * 
height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + float weight = dmcn_get_gradient_weight_cuda(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * 
width_col + w_out);
+      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
+      const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
+      const float offset_h = data_offset_ptr[data_offset_h_ptr];
+      const float offset_w = data_offset_ptr[data_offset_w_ptr];
+      const float mask = data_mask_ptr[data_mask_hw_ptr];
+      float inv_h = h_in + i * dilation_h + offset_h;
+      float inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+      {
+        inv_h = inv_w = -2;
+      }
+      else
+      {
+        mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cuda(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
+      }
+      const float weight = dmcn_get_coordinate_weight_cuda(
+          inv_h, inv_w,
+          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
+      val += weight * data_col_ptr[col_pos] * mask;
+      cnt += 1;
+    }
+    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
+    grad_offset[index] = val;
+    if (offset_c % 2 == 0)
+      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
+      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
+  }
+}
+
+void modulated_deformable_im2col_cuda(cudaStream_t stream,
+  const float* data_im, const float* data_offset, const float* data_mask,
+  const int batch_size, const int channels, const int height_im, const int width_im,
+  const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+  const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+  const int dilation_h, const int dilation_w,
+  const int deformable_group, float* data_col) {
+  // num_axes should be smaller than block size
+  const int channel_per_deformable_group = channels / deformable_group;
+  const int num_kernels = channels * batch_size * height_col * width_col;
+  modulated_deformable_im2col_gpu_kernel
+      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
+      num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w,
+      pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
+      batch_size, channels, deformable_group, height_col, width_col, data_col);
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
+
+void modulated_deformable_col2im_cuda(cudaStream_t stream,
+  const float* data_col, const float* data_offset, const float* data_mask,
+  const int batch_size, const int channels, const int height_im, const int width_im,
+  const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+  const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+  const int dilation_h, const int dilation_w,
+  const int deformable_group, float* grad_im){
+
+  const int channel_per_deformable_group = channels / deformable_group;
+  const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
+  modulated_deformable_col2im_gpu_kernel
+      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
+      num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im,
+      kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+      dilation_h, dilation_w, channel_per_deformable_group,
+      batch_size, deformable_group, height_col, width_col, grad_im);
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    
printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + modulated_deformable_col2im_coord_gpu_kernel + <<>>( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.h new file mode 100644 index 00000000..c8568319 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.h @@ -0,0 +1,101 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ + +#ifndef DCN_V2_IM2COL_CUDA +#define DCN_V2_IM2COL_CUDA + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu new file mode 100644 index 00000000..bf99f0ca --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu @@ -0,0 +1,419 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ T bilinear_interp_cuda( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernelCuda( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp_cuda(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernelCuda( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 
2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { + DeformablePSROIPoolForwardKernelCuda<<>>( + out_size, + input.contiguous().data_ptr(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data_ptr(), + trans.contiguous().data_ptr(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data_ptr(), + top_count.data_ptr()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { + DeformablePSROIPoolBackwardAccKernelCuda<<>>( + out_size, + out_grad.contiguous().data_ptr(), + top_count.contiguous().data_ptr(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data_ptr(), + trans_grad.contiguous().data_ptr(), + input.contiguous().data_ptr(), + bbox.contiguous().data_ptr(), + trans.contiguous().data_ptr(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/vision.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/vision.h new file mode 100644 index 00000000..f3672b11 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/cuda/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include +#include +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/dcn_v2.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/dcn_v2.h new file mode 100644 index 00000000..de670bf9 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/dcn_v2.h @@ -0,0 +1,190 @@ +#pragma once + +#include 
"cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +at::Tensor +dcn_v2_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_cpu_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); + } +} + +std::vector +dcn_v2_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_cpu_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); + } +} + +std::tuple +dcn_v2_psroi_pooling_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_psroi_pooling_cpu_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); + } +} + +std::tuple +dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_psroi_pooling_cpu_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); + } +} \ No newline 
at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/vision.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/vision.cpp new file mode 100644 index 00000000..ff54233e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/src/vision.cpp @@ -0,0 +1,9 @@ + +#include "dcn_v2.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); + m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); + m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); + m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcpu.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcpu.py new file mode 100644 index 00000000..0a0b36eb --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcpu.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from dcn_v2 import dcn_v2_conv, DCNv2, DCN +from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling + +deformable_groups = 1 +N, inC, inH, inW = 2, 2, 4, 4 +outC = 2 +kH, kW = 3, 3 + + +def conv_identify(weight, bias): + weight.data.zero_() + bias.data.zero_() + o, i, h, w = weight.shape + y = h//2 + x = w//2 + for p in range(i): + for q in range(o): + if p == q: + weight.data[q, p, y, x] = 1.0 + + +def check_zero_offset(): + conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True) + + conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True) + + dcn_v2 = DCNv2(inC, outC, (kH, kW), + stride=1, padding=1, dilation=1, + deformable_groups=deformable_groups) + + conv_offset.weight.data.zero_() + conv_offset.bias.data.zero_() + conv_mask.weight.data.zero_() + conv_mask.bias.data.zero_() + conv_identify(dcn_v2.weight, dcn_v2.bias) + + input = torch.randn(N, inC, inH, inW) + offset = conv_offset(input) + mask = conv_mask(input) + mask = torch.sigmoid(mask) + output = dcn_v2(input, offset, mask) + output *= 2 + d = (input - output).abs().max() + if d < 1e-10: + print('Zero offset passed') + else: + print('Zero offset failed') + print(input) + print(output) + +def check_gradient_dconv(): + + input = torch.rand(N, inC, inH, inW) * 0.01 + input.requires_grad = True + + offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW) * 2 + # offset.data.zero_() + # offset.data -= 0.5 + offset.requires_grad = True + + mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW) + # mask.data.zero_() + mask.requires_grad = True + mask = torch.sigmoid(mask) + + weight = torch.randn(outC, inC, kH, kW) + weight.requires_grad = True + + bias = torch.rand(outC) + bias.requires_grad = True + + stride = 1 + padding = 1 + dilation = 1 + + print('check_gradient_dconv: ', + gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups), + eps=1e-3, 
atol=1e-4, rtol=1e-2)) + + +def check_pooling_zero_offset(): + + input = torch.randn(2, 16, 64, 64).zero_() + input[0, :, 16:26, 16:26] = 1. + input[1, :, 10:20, 20:30] = 2. + rois = torch.tensor([ + [0, 65, 65, 103, 103], + [1, 81, 41, 119, 79], + ]).float() + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0) + + out = pooling(input, rois, input.new()) + s = ', '.join(['%f' % out[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0) + offset = torch.randn(20, 2, 7, 7).zero_() + dout = dpooling(input, rois, offset) + s = ', '.join(['%f' % dout[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + +def check_gradient_dpooling(): + input = torch.randn(2, 3, 5, 5) * 0.01 + N = 4 + batch_inds = torch.randint(2, (N, 1)).float() + x = torch.rand((N, 1)).float() * 15 + y = torch.rand((N, 1)).float() * 15 + w = torch.rand((N, 1)).float() * 10 + h = torch.rand((N, 1)).float() * 10 + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(N, 2, 3, 3) + input.requires_grad = True + offset.requires_grad = True + + spatial_scale = 1.0 / 4 + pooled_size = 3 + output_dim = 3 + no_trans = 0 + group_size = 1 + trans_std = 0.0 + sample_per_part = 4 + part_size = pooled_size + + print('check_gradient_dpooling:', + gradcheck(dcn_v2_pooling, (input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std), + eps=1e-4)) + + +def example_dconv(): + input = torch.randn(2, 64, 128, 128) + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, + padding=1, deformable_groups=2) + # print(dcn.weight.shape, input.shape) + output = dcn(input) + targert = output.new(*output.size()) + targert.data.uniform_(-0.01, 0.01) + error = (targert - output).mean() + error.backward() + print(output.shape) + + +def example_dpooling(): + input = torch.randn(2, 32, 64, 64) + batch_inds = torch.randint(2, (20, 1)).float() + x = torch.randint(256, (20, 1)).float() + y = torch.randint(256, (20, 1)).float() + w = torch.randint(64, (20, 1)).float() + h = torch.randint(64, (20, 1)).float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(20, 2, 7, 7) + input.requires_grad = True + offset.requires_grad = True + + # normal roi_align + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1) + + # deformable pooling + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1) + + out = pooling(input, rois, offset) + dout = dpooling(input, rois, offset) + print(out.shape) + print(dout.shape) + + target_out = out.new(*out.size()) + target_out.data.uniform_(-0.01, 0.01) + target_dout = dout.new(*dout.size()) + target_dout.data.uniform_(-0.01, 0.01) + e = (target_out - out).mean() + e.backward() + e = (target_dout - dout).mean() + e.backward() + + +def example_mdpooling(): + input = torch.randn(2, 32, 64, 64) + input.requires_grad = True + batch_inds = torch.randint(2, (20, 1)).float() + x = torch.randint(256, (20, 1)).float() + y = torch.randint(256, (20, 1)).float() + w = torch.randint(64, (20, 1)).float() + h = torch.randint(64, (20, 1)).float() + rois = torch.cat((batch_inds, x, y, x 
+ w, y + h), dim=1) + + # mdformable pooling (V2) + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024) + + dout = dpooling(input, rois) + target = dout.new(*dout.size()) + target.data.uniform_(-0.1, 0.1) + error = (target - dout).mean() + error.backward() + print(dout.shape) + + +if __name__ == '__main__': + + example_dconv() + example_dpooling() + example_mdpooling() + + check_pooling_zero_offset() + # zero offset check + if inC == outC: + check_zero_offset() + + check_gradient_dpooling() + check_gradient_dconv() + # """ + # ****** Note: backward is not reentrant error may not be a serious problem, + # ****** since the max error is less than 1e-7, + # ****** Still looking for what trigger this problem + # """ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcuda.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcuda.py new file mode 100644 index 00000000..3bd5bd22 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2/testcuda.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from dcn_v2 import dcn_v2_conv, DCNv2, DCN +from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling + +deformable_groups = 1 +N, inC, inH, inW = 2, 2, 4, 4 +outC = 2 +kH, kW = 3, 3 + + +def conv_identify(weight, bias): + weight.data.zero_() + bias.data.zero_() + o, i, h, w = weight.shape + y = h//2 + x = w//2 + for p in range(i): + for q in range(o): + if p == q: + weight.data[q, p, y, x] = 1.0 + + +def check_zero_offset(): + conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + dcn_v2 = DCNv2(inC, outC, (kH, kW), + stride=1, padding=1, dilation=1, + deformable_groups=deformable_groups).cuda() + + conv_offset.weight.data.zero_() + conv_offset.bias.data.zero_() + conv_mask.weight.data.zero_() + conv_mask.bias.data.zero_() + conv_identify(dcn_v2.weight, dcn_v2.bias) + + input = torch.randn(N, inC, inH, inW).cuda() + offset = conv_offset(input) + mask = conv_mask(input) + mask = torch.sigmoid(mask) + output = dcn_v2(input, offset, mask) + output *= 2 + d = (input - output).abs().max() + if d < 1e-10: + print('Zero offset passed') + else: + print('Zero offset failed') + print(input) + print(output) + +def check_gradient_dconv(): + + input = torch.rand(N, inC, inH, inW).cuda() * 0.01 + input.requires_grad = True + + offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2 + # offset.data.zero_() + # offset.data -= 0.5 + offset.requires_grad = True + + mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW).cuda() + # mask.data.zero_() + mask.requires_grad = True + mask = torch.sigmoid(mask) + + weight = torch.randn(outC, inC, kH, kW).cuda() + weight.requires_grad = True + + bias = torch.rand(outC).cuda() + bias.requires_grad = True + + stride = 1 + padding = 1 + dilation = 1 + + print('check_gradient_dconv: ', + gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, + stride, padding, 
dilation, deformable_groups), + eps=1e-3, atol=1e-4, rtol=1e-2)) + + +def check_pooling_zero_offset(): + + input = torch.randn(2, 16, 64, 64).cuda().zero_() + input[0, :, 16:26, 16:26] = 1. + input[1, :, 10:20, 20:30] = 2. + rois = torch.tensor([ + [0, 65, 65, 103, 103], + [1, 81, 41, 119, 79], + ]).cuda().float() + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0).cuda() + + out = pooling(input, rois, input.new()) + s = ', '.join(['%f' % out[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0).cuda() + offset = torch.randn(20, 2, 7, 7).cuda().zero_() + dout = dpooling(input, rois, offset) + s = ', '.join(['%f' % dout[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + +def check_gradient_dpooling(): + input = torch.randn(2, 3, 5, 5).cuda() * 0.01 + N = 4 + batch_inds = torch.randint(2, (N, 1)).cuda().float() + x = torch.rand((N, 1)).cuda().float() * 15 + y = torch.rand((N, 1)).cuda().float() * 15 + w = torch.rand((N, 1)).cuda().float() * 10 + h = torch.rand((N, 1)).cuda().float() * 10 + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(N, 2, 3, 3).cuda() + input.requires_grad = True + offset.requires_grad = True + + spatial_scale = 1.0 / 4 + pooled_size = 3 + output_dim = 3 + no_trans = 0 + group_size = 1 + trans_std = 0.0 + sample_per_part = 4 + part_size = pooled_size + + print('check_gradient_dpooling:', + gradcheck(dcn_v2_pooling, (input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std), + eps=1e-4)) + + +def example_dconv(): + input = torch.randn(2, 64, 128, 128).cuda() + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, + padding=1, deformable_groups=2).cuda() + # print(dcn.weight.shape, input.shape) + output = dcn(input) + targert = output.new(*output.size()) + targert.data.uniform_(-0.01, 0.01) + error = (targert - output).mean() + error.backward() + print(output.shape) + + +def example_dpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + batch_inds = torch.randint(2, (20, 1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(20, 2, 7, 7).cuda() + input.requires_grad = True + offset.requires_grad = True + + # normal roi_align + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1).cuda() + + # deformable pooling + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1).cuda() + + out = pooling(input, rois, offset) + dout = dpooling(input, rois, offset) + print(out.shape) + print(dout.shape) + + target_out = out.new(*out.size()) + target_out.data.uniform_(-0.01, 0.01) + target_dout = dout.new(*dout.size()) + target_dout.data.uniform_(-0.01, 0.01) + e = (target_out - out).mean() + e.backward() + e = (target_dout - dout).mean() + e.backward() + + +def example_mdpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + input.requires_grad = True + batch_inds = torch.randint(2, (20, 
1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + + # mdformable pooling (V2) + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024).cuda() + + dout = dpooling(input, rois) + target = dout.new(*dout.size()) + target.data.uniform_(-0.1, 0.1) + error = (target - dout).mean() + error.backward() + print(dout.shape) + + +if __name__ == '__main__': + + example_dconv() + example_dpooling() + example_mdpooling() + + check_pooling_zero_offset() + # zero offset check + if inC == outC: + check_zero_offset() + + check_gradient_dpooling() + check_gradient_dconv() + # """ + # ****** Note: backward is not reentrant error may not be a serious problem, + # ****** since the max error is less than 1e-7, + # ****** Still looking for what trigger this problem + # """ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/LICENSE b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/LICENSE new file mode 100644 index 00000000..b2e3b520 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2019, Charles Shang +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
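The pooling paths in the two test scripts above pass regions of interest as a single (num_rois, 5) tensor whose rows are [batch_index, x1, y1, x2, y2]; the CUDA kernels read five values per ROI (offset_bottom_rois + n * 5). A minimal sketch of assembling such a tensor from per-image boxes (make_rois is an illustrative name, not part of this patch):

# Sketch only: hypothetical helper showing the 5-column ROI layout
# ([batch_index, x1, y1, x2, y2]) consumed by dcn_v2_pooling.
import torch

def make_rois(boxes_per_image):
    # boxes_per_image: list of (N_i, 4) tensors with (x1, y1, x2, y2) boxes.
    rows = []
    for batch_index, boxes in enumerate(boxes_per_image):
        inds = torch.full((boxes.shape[0], 1), float(batch_index))
        rows.append(torch.cat((inds, boxes.float()), dim=1))
    return torch.cat(rows, dim=0)  # shape (sum of N_i, 5)

# Example mirroring check_pooling_zero_offset: two images, one box each.
rois = make_rois([torch.tensor([[65., 65., 103., 103.]]),
                  torch.tensor([[81., 41., 119., 79.]])])
assert rois.shape == (2, 5)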
\ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/build.ninja new file mode 100644 index 00000000..d05f7a42 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/build.ninja @@ -0,0 +1,23 @@ +ninja_required_version = 1.3 +cxx = c++ + +cflags = -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/build/temp.linux-x86_64-cpython-37/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/vision.o: compile 
/ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/DCNv2_1.4/src/vision.cpp + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/dcn_v2.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/dcn_v2.py new file mode 100644 index 00000000..d3f44c08 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/dcn_v2.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import math +import torch +from torch import nn +from torch.autograd import Function +from torch.nn.modules.utils import _pair +from torch.autograd.function import once_differentiable +from torch.onnx.symbolic_helper import parse_args +import _ext as _backend + + +class _DCNv2(Function): + #@parse_args('v', 'v', "v", 'v', 'v', "i", "i", "i", "i", "i") + #def symbolic(g, input, offset, weight, stride=1, padding=0,\ + # dilation=1, groups=1,\ + # deformable_groups=1, im2col_step=64): + # return g.op("DeformConv", input, offset, weight,strides_i=stride, \ + # padding_i = padding, dilation_i=dilation, groups_i=groups, \ + # deformable_group_i=deformable_groups, im2col_step_i = im2col_step, auto_pad_s="None") + + @staticmethod + def forward(ctx, input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups): + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.kernel_size = _pair(weight.shape[2:4]) + ctx.deformable_groups = deformable_groups + output = _backend.dcn_v2_forward(input, weight, bias, + offset, mask, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + ctx.save_for_backward(input, offset, mask, weight, bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \ + _backend.dcn_v2_backward(input, weight, + bias, + offset, mask, + grad_output, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + + return grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\ + None, None, None, None, + + +dcn_v2_conv = _DCNv2.apply + + +class DCNv2(nn.Module): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding, dilation=1, deformable_groups=1): + super(DCNv2, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter(torch.Tensor( + out_channels, in_channels, *self.kernel_size)) + self.bias = nn.Parameter(torch.Tensor(out_channels)) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + self.bias.data.zero_() + + def forward(self, input, offset, mask): + assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + offset.shape[1] + assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + mask.shape[1] + return dcn_v2_conv(input, offset, mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + +class DCN(DCNv2): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding, + dilation=1, deformable_groups=1): + super(DCN, self).__init__(in_channels, out_channels, + kernel_size, stride, padding, dilation, deformable_groups) + + channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] + self.conv_offset_mask = nn.Conv2d(self.in_channels, + channels_, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + #@parse_args('v', 'v', "v", 'v', 'v', "i", "i", "i", "i", "i") + #def symbolic(g, input, offset, weight, stride=1, padding=0, + # dilation=1, groups=1, + # deformable_groups=1, im2col_step=64): + # return g.op("DeformConv", input, offset, weight,strides_i=stride, + # padding_i = padding, dilation_i=dilation, groups_i=groups, + # deformable_group_i=deformable_groups, im2col_step_i = im2col_step, auto_pad_s="None") + + def forward(self, input): + out = self.conv_offset_mask(input) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return dcn_v2_conv(input, offset, mask, + self.weight, self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + + +class _DCNv2Pooling(Function): + @staticmethod + def forward(ctx, input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + ctx.spatial_scale = spatial_scale + ctx.no_trans = int(no_trans) + ctx.output_dim = output_dim + ctx.group_size = group_size + ctx.pooled_size = pooled_size + ctx.part_size = pooled_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + output, output_count = \ + _backend.dcn_v2_psroi_pooling_forward(input, rois, offset, + ctx.no_trans, ctx.spatial_scale, + ctx.output_dim, ctx.group_size, + ctx.pooled_size, ctx.part_size, + ctx.sample_per_part, ctx.trans_std) + ctx.save_for_backward(input, rois, offset, output_count) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, offset, output_count = ctx.saved_tensors + grad_input, grad_offset = \ + _backend.dcn_v2_psroi_pooling_backward(grad_output, + input, + rois, + offset, + output_count, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std) + + return grad_input, None, grad_offset, \ + None, None, None, None, None, None, None, None + + +dcn_v2_pooling = _DCNv2Pooling.apply + + +class DCNv2Pooling(nn.Module): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DCNv2Pooling, self).__init__() + self.spatial_scale = spatial_scale + self.pooled_size = pooled_size + self.output_dim = output_dim + 
self.no_trans = no_trans + self.group_size = group_size + self.part_size = pooled_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, input, rois, offset): + assert input.shape[1] == self.output_dim + if self.no_trans: + offset = input.new() + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + +class DCNPooling(DCNv2Pooling): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + deform_fc_dim=1024): + super(DCNPooling, self).__init__(spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std) + + self.deform_fc_dim = deform_fc_dim + + if not no_trans: + self.offset_mask_fc = nn.Sequential( + nn.Linear(self.pooled_size * self.pooled_size * + self.output_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.pooled_size * + self.pooled_size * 3) + ) + self.offset_mask_fc[4].weight.data.zero_() + self.offset_mask_fc[4].bias.data.zero_() + + def forward(self, input, rois): + offset = input.new() + + if not self.no_trans: + + # do roi_align first + n = rois.shape[0] + roi = dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + True, # no trans + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + # build mask and offset + offset_mask = self.offset_mask_fc(roi.view(n, -1)) + offset_mask = offset_mask.view( + n, 3, self.pooled_size, self.pooled_size) + o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + + # do pooling with offset and mask + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) * mask + # only roi_align + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/make.sh b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/make.sh new file mode 100755 index 00000000..f1f15c0e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/make.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +python setup.py build develop diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/setup.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/setup.py new file mode 100644 index 00000000..0e0dc964 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/setup.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +import os +import glob + +import torch + +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +from setuptools 
import find_packages +from setuptools import setup + +requirements = ["torch", "torchvision"] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + os.environ["CC"] = "g++" + sources = main_file + source_cpu + extension = CppExtension + extra_compile_args = {"cxx": []} + define_macros = [] + + + if torch.cuda.is_available() and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + #raise NotImplementedError('Cuda is not available') + pass + + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + ext_modules = [ + extension( + "_ext", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + return ext_modules + +setup( + name="DCNv2", + version="0.1", + author="charlesshang", + url="https://github.com/charlesshang/DCNv2", + description="deformable convolutional networks", + packages=find_packages(exclude=("configs", "tests",)), + # install_requires=requirements, + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.cpp new file mode 100644 index 00000000..a94273e2 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_cpu.cpp @@ -0,0 +1,233 @@ +#include +#include "cpu/dcn_v2_im2col_cpu.h" + +#include +//#include + +#include +//#include +//#include + +//extern THCState *state; + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + using scalar_t = float; + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto output_n = output.select(0, b); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + // (N x 1) (1 x M) + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, + ones.contiguous().data(), k_, + bias.contiguous().data(), k_, 0.0f, + output_n.data(), n_); + + modulated_deformable_im2col_cpu(input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data()); + + //(k * m) x (m * n) + // Y = WC + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THFloatBlas_gemm('n', 'n', n, m, k, 1.0f, + columns.data(), n, + weight.data(), k, 1.0f, + output_n.data(), n); + } + return output; +} + +std::vector dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int 
dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THFloatBlas_gemm('n', 't', n, m, k, 1.0f, + grad_output_n.data(), n, + weight.data(), m, 0.0f, + columns.data(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cpu(columns.data(), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data(), + grad_mask_n.data()); + // gradient w.r.t. input data + modulated_deformable_col2im_cpu(columns.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data()); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cpu(input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THFloatBlas_gemm('t', 'n', n_, m_, k_, 1.0f, + columns.data(), k_, + grad_output_n.data(), k_, 1.0f, + grad_weight.data(), n_); + + // gradient w.r.t. bias + // long m_ = channels_out; + // long k__ = height_out * width_out; + THFloatBlas_gemv('t', k_, m_, 1.0f, + grad_output_n.data(), k_, + ones.data(), 1, 1.0f, + grad_bias.data(), 1); + } + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.cpp new file mode 100644 index 00000000..1704a60d --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.cpp @@ -0,0 +1,395 @@ +#include "dcn_v2_im2col_cpu.h" +#include +#include +#include + +#include +//#include + +#include +//#include +//#include + +// modified from the CUDA version for CPU use by Daniel K. Suhendro + +/*#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + + +float dmcn_im2col_bilinear_cpu(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +float dmcn_get_gradient_weight_cpu(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - 
w);
+  return weight;
+}
+
+float dmcn_get_coordinate_weight_cpu(float argmax_h, float argmax_w,
+                                     const int height, const int width, const float *im_data,
+                                     const int data_width, const int bp_dir)
+{
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+  {
+    //empty
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  float weight = 0;
+
+  if (bp_dir == 0)
+  {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+  else if (bp_dir == 1)
+  {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+
+  return weight;
+}
+
+void modulated_deformable_im2col_cpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask,
+                                            const int height, const int width, const int kernel_h, const int kernel_w,
+                                            const int pad_h, const int pad_w,
+                                            const int stride_h, const int stride_w,
+                                            const int dilation_h, const int dilation_w,
+                                            const int channel_per_deformable_group,
+                                            const int batch_size, const int num_channels, const int deformable_group,
+                                            const int height_col, const int width_col,
+                                            float *data_col)
+{
+  // launch channels * batch_size * height_col * width_col cores
+  // (loop body reconstructed from the matching CUDA kernel later in this patch,
+  //  modulated_deformable_im2col_gpu_kernel; only the CUDA_KERNEL_LOOP is replaced by a plain for loop)
+  for (int index = 0; index < n; index++)
+  {
+    // index index of output matrix
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col / num_channels) % batch_size;
+    const int c_im = (index / width_col / height_col) % num_channels;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+
+    float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col;
+    const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
+    const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
+    const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i)
+    {
+      for (int j = 0; j < kernel_w; ++j)
+      {
+        const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
+        const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+        const float offset_h = data_offset_ptr[data_offset_h_ptr];
+        const float offset_w = data_offset_ptr[data_offset_w_ptr];
+        const float mask = data_mask_ptr[data_mask_hw_ptr];
+        float val = static_cast<float>(0);
+        const float h_im = h_in + i * dilation_h + offset_h;
+        const float w_im = w_in + j * dilation_w + offset_w;
+        //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+        {
+          //const float map_h = i * dilation_h + offset_h;
+          //const float map_w = j * dilation_w + offset_w;
+          //const int cur_height = height - h_in;
+          //const int cur_width = width - w_in;
+          //val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
+          val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val * mask;
+        // data_col_ptr += batch_size * height_col * width_col;
+        data_col_ptr += height_col * width_col;
+      }
+    }
+  }
+}
+
+void modulated_deformable_col2im_cpu_kernel(const int n, const float *data_col, const float *data_offset, const float *data_mask,
+                                            const int channels, const int height, const int width,
+                                            const int kernel_h, const int kernel_w,
+                                            const int pad_h, const int pad_w,
+                                            const int stride_h, const int stride_w,
+                                            const int dilation_h, const int dilation_w,
+                                            const int channel_per_deformable_group,
+ 
const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + for(int index = 0; index < n; index++) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + float weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + //atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad; + + } + } + } + } +} + +void modulated_deformable_col2im_coord_cpu_kernel(const int n, const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + for(int index = 0; index < n; index++) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * 
height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float inv_h = h_in + i * dilation_h + offset_h; + float inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const float weight = dmcn_get_coordinate_weight_cpu( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cpu(const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_cpu_kernel( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + 
printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + }*/ + +} + +void modulated_deformable_col2im_cpu(const float* data_col, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* grad_im){ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + modulated_deformable_col2im_cpu_kernel( + num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im); + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + }*/ + +} + +void modulated_deformable_col2im_coord_cpu(const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + modulated_deformable_col2im_coord_cpu_kernel( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + /*cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + }*/ +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.h new file mode 100644 index 00000000..bad5c528 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_im2col_cpu.h @@ -0,0 +1,99 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. 
If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +#ifndef DCN_V2_IM2COL_CPU +#define DCN_V2_IM2COL_CPU + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cpu(const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cpu(const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cpu(const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.cpp new file mode 100644 index 00000000..6e41aaed --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/dcn_v2_psroi_pooling_cpu.cpp @@ -0,0 +1,426 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ +// modified from the CUDA version for CPU use by Daniel K. 
Suhendro + +#include +#include +#include + +#include +//#include + +#include +//#include +//#include + +/*#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + +template +T bilinear_interp_cpu( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template + void DeformablePSROIPoolForwardKernelCpu( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + for(int index = 0; index < count; index++) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = std::min(std::max(w, T(0.)), width - T(1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp_cpu(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +void DeformablePSROIPoolBackwardAccKernelCpu( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + for(int index = 0; index < count; index++) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = std::min(std::max(w, T(0.)), width - T(1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + /*atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);*/ + *(offset_bottom_data_diff + bottom_index_base + y0 * width + x0) += q00 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y1 * width + x0) += q01 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y0 * width + x1) += q10 * diff_val; + *(offset_bottom_data_diff + bottom_index_base + y1 * width + x1) += q11 * diff_val; + + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + /*atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);*/ + *(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w) += diff_x; + *(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w) += diff_y; + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float 
spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + /*AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + //cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512);*/ + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cpu_forward", [&] { + DeformablePSROIPoolForwardKernelCpu( + out_size, + input.contiguous().data(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data(), + top_count.data()); + }); + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + /*AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");*/ + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + /*dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream();*/ + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cpu_backward", [&] { + DeformablePSROIPoolBackwardAccKernelCpu( + out_size, + out_grad.contiguous().data(), + top_count.contiguous().data(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data(), + trans_grad.contiguous().data(), + input.contiguous().data(), + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + //THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/vision.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/vision.h new file mode 100644 index 00000000..d5fbf1f0 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cpu/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_cuda.cu new file mode 100644 index 00000000..9711a497 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_cuda.cu @@ -0,0 +1,335 @@ +#include 
+#include "cuda/dcn_v2_im2col_cuda.h" + +#include +#include + +#include +#include +#include + +THCState *state = at::globalContext().lazyInitCUDA(); + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +// [batch gemm] +// https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu + +__global__ void createBatchGemmBuffer(const float **input_b, float **output_b, + float **columns_b, const float **ones_b, + const float **weight_b, const float **bias_b, + float *input, float *output, + float *columns, float *ones, + float *weight, float *bias, + const int input_stride, const int output_stride, + const int columns_stride, const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + input_b[idx] = input + idx * input_stride; + output_b[idx] = output + idx * output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + bias_b[idx] = bias; + } +} + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + using scalar_t = float; + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({batch, height_out, width_out}, input.options()); + auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + // prepare for batch-wise computing, which is significantly faster than instance-wise computing + // when batch size is large. 
+ // launch batch threads + int matrices_size = batch * sizeof(float *); + auto input_b = static_cast(THCudaMalloc(state, matrices_size)); + auto output_b = static_cast(THCudaMalloc(state, matrices_size)); + auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); + auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); + auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); + auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); + + const int block = 128; + const int grid = (batch + block - 1) / block; + + createBatchGemmBuffer<<>>( + input_b, output_b, + columns_b, ones_b, + weight_b, bias_b, + input.data(), + output.data(), + columns.data(), + ones.data(), + weight.data(), + bias.data(), + channels * width * height, + channels_out * width_out * height_out, + channels * kernel_h * kernel_w * height_out * width_out, + height_out * width_out, + batch); + + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THCudaBlas_SgemmBatched(state, + 't', + 'n', + n_, + m_, + k_, + 1.0f, + ones_b, k_, + bias_b, k_, + 0.0f, + output_b, n_, + batch); + + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input.data(), + offset.data(), + mask.data(), + batch, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data()); + + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THCudaBlas_SgemmBatched(state, + 'n', + 'n', + n, + m, + k, + 1.0f, + (const float **)columns_b, n, + weight_b, k, + 1.0f, + output_b, n, + batch); + + THCudaFree(state, input_b); + THCudaFree(state, output_b); + THCudaFree(state, columns_b); + THCudaFree(state, ones_b); + THCudaFree(state, weight_b); + THCudaFree(state, bias_b); + return output; +} + +__global__ void createBatchGemmBufferBackward( + float **grad_output_b, + float **columns_b, + float **ones_b, + float **weight_b, + float **grad_weight_b, + float **grad_bias_b, + float *grad_output, + float *columns, + float *ones, + float *weight, + float *grad_weight, + float *grad_bias, + const int grad_output_stride, + const int columns_stride, + const int ones_stride, + const int num_batches) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_batches) + { + grad_output_b[idx] = grad_output + idx * grad_output_stride; + columns_b[idx] = columns + idx * columns_stride; + ones_b[idx] = ones + idx * ones_stride; + + // share weights and bias within a Mini-Batch + weight_b[idx] = weight; + grad_weight_b[idx] = grad_weight; + grad_bias_b[idx] = grad_bias; + } +} + +std::vector dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = 
input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, + grad_output_n.data(), n, + weight.data(), m, 0.0f, + columns.data(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state), + columns.data(), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data(), + grad_mask_n.data()); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda(THCState_getCurrentStream(state), + columns.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data()); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, + columns.data(), k_, + grad_output_n.data(), k_, 1.0f, + grad_weight.data(), n_); + + // gradient w.r.t. 
bias + // long m_ = channels_out; + // long k__ = height_out * width_out; + THCudaBlas_Sgemv(state, + 't', + k_, m_, 1.0f, + grad_output_n.data(), k_, + ones.data(), 1, 1.0f, + grad_bias.data(), 1); + } + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.cu new file mode 100644 index 00000000..4140eacf --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.cu @@ -0,0 +1,402 @@ +#include "dcn_v2_im2col_cuda.h" +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + + +__device__ float dmcn_im2col_bilinear_cuda(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +__device__ float dmcn_get_gradient_weight_cuda(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +__device__ float dmcn_get_coordinate_weight_cuda(float argmax_h, float argmax_w, + const int height, const int width, const float *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * 
im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) +{ + // launch channels * batch_size * height_col * width_col cores + CUDA_KERNEL_LOOP(index, n) + { + // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) + // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis + + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + // const int b_col = (index / width_col / height_col) % batch_size; + const int b_col = (index / width_col / height_col / num_channels) % batch_size; + // const int c_im = (index / width_col / height_col) / batch_size; + const int c_im = (index / width_col / height_col) % num_channels; + // const int c_col = c_im * kernel_h * kernel_w; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int 
data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float val = static_cast(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear_cuda(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } +} + +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const float *data_col, const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + 
dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + float weight = dmcn_get_gradient_weight_cuda(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float inv_h = h_in + i * dilation_h + offset_h; + float inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cuda(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const float weight = dmcn_get_coordinate_weight_cuda( + inv_h, inv_w, 
+ height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float* data_col, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* grad_im){ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + modulated_deformable_col2im_gpu_kernel + <<>>( + num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + 
modulated_deformable_col2im_coord_gpu_kernel + <<>>( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.h new file mode 100644 index 00000000..c8568319 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_im2col_cuda.h @@ -0,0 +1,101 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. 
+ * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ + +#ifndef DCN_V2_IM2COL_CUDA +#define DCN_V2_IM2COL_CUDA + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_psroi_pooling_cuda.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_psroi_pooling_cuda.cu new file mode 100644 index 00000000..8f08f6a3 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/dcn_v2_psroi_pooling_cuda.cu @@ -0,0 +1,419 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ T bilinear_interp_cuda( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernelCuda( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp_cuda(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernelCuda( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 
2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { + DeformablePSROIPoolForwardKernelCuda<<>>( + out_size, + input.contiguous().data(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data(), + top_count.data()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { + DeformablePSROIPoolBackwardAccKernelCuda<<>>( + out_size, + out_grad.contiguous().data(), + top_count.contiguous().data(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data(), + trans_grad.contiguous().data(), + input.contiguous().data(), + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/vision.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/vision.h new file mode 100644 index 00000000..e42a2a79 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/cuda/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/dcn_v2.h b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/dcn_v2.h new file mode 100644 index 00000000..de670bf9 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/dcn_v2.h @@ -0,0 +1,190 @@ +#pragma once + +#include "cpu/vision.h" 
+ +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +at::Tensor +dcn_v2_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_cpu_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); + } +} + +std::vector +dcn_v2_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_cpu_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); + } +} + +std::tuple +dcn_v2_psroi_pooling_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_psroi_pooling_cpu_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); + } +} + +std::tuple +dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + else{ + return dcn_v2_psroi_pooling_cpu_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); + } +} \ No newline at end of file 
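For orientation: dcn_v2.h above routes each operator to the CUDA build (guarded by WITH_CUDA) or the CPU fallback depending on input.type().is_cuda(), and the vision.cpp added next exposes the four entry points through pybind11. The following is a minimal usage sketch, not taken from this patch: it assumes the extension is importable as `_ext` (the real module name comes from TORCH_EXTENSION_NAME in the build script, which is not shown here), and the tensor shapes follow the conventions used by the bundled test scripts.

# Hypothetical sketch -- the module name `_ext` and the chosen sizes are assumptions.
import torch
import _ext  # compiled from vision.cpp via torch.utils.cpp_extension

N, C_in, C_out, H, W = 2, 64, 64, 128, 128
kH = kW = 3
deformable_groups = 2

inp    = torch.randn(N, C_in, H, W).cuda()
weight = torch.randn(C_out, C_in, kH, kW).cuda()
bias   = torch.zeros(C_out).cuda()
# With stride=1, pad=1, dilation=1 the output keeps the input spatial size, so the
# offsets (2 values per sampling point) and the modulation mask live on an (H, W) grid.
offset = torch.zeros(N, deformable_groups * 2 * kH * kW, H, W).cuda()
mask   = torch.ones(N, deformable_groups * kH * kW, H, W).cuda()

out = _ext.dcn_v2_forward(inp, weight, bias, offset, mask,
                          kH, kW,   # kernel_h, kernel_w
                          1, 1,     # stride_h, stride_w
                          1, 1,     # pad_h, pad_w
                          1, 1,     # dilation_h, dilation_w
                          deformable_groups)
print(out.shape)  # (2, 64, 128, 128)

With zero offsets and an all-ones mask the deformable sampling grid coincides with the regular 3x3 grid, so the call behaves like a plain convolution; this is essentially the property that check_zero_offset in the test scripts below verifies.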
diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/vision.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/vision.cpp new file mode 100644 index 00000000..ff54233e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/src/vision.cpp @@ -0,0 +1,9 @@ + +#include "dcn_v2.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); + m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); + m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); + m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcpu.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcpu.py new file mode 100644 index 00000000..0a0b36eb --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcpu.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from dcn_v2 import dcn_v2_conv, DCNv2, DCN +from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling + +deformable_groups = 1 +N, inC, inH, inW = 2, 2, 4, 4 +outC = 2 +kH, kW = 3, 3 + + +def conv_identify(weight, bias): + weight.data.zero_() + bias.data.zero_() + o, i, h, w = weight.shape + y = h//2 + x = w//2 + for p in range(i): + for q in range(o): + if p == q: + weight.data[q, p, y, x] = 1.0 + + +def check_zero_offset(): + conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True) + + conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True) + + dcn_v2 = DCNv2(inC, outC, (kH, kW), + stride=1, padding=1, dilation=1, + deformable_groups=deformable_groups) + + conv_offset.weight.data.zero_() + conv_offset.bias.data.zero_() + conv_mask.weight.data.zero_() + conv_mask.bias.data.zero_() + conv_identify(dcn_v2.weight, dcn_v2.bias) + + input = torch.randn(N, inC, inH, inW) + offset = conv_offset(input) + mask = conv_mask(input) + mask = torch.sigmoid(mask) + output = dcn_v2(input, offset, mask) + output *= 2 + d = (input - output).abs().max() + if d < 1e-10: + print('Zero offset passed') + else: + print('Zero offset failed') + print(input) + print(output) + +def check_gradient_dconv(): + + input = torch.rand(N, inC, inH, inW) * 0.01 + input.requires_grad = True + + offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW) * 2 + # offset.data.zero_() + # offset.data -= 0.5 + offset.requires_grad = True + + mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW) + # mask.data.zero_() + mask.requires_grad = True + mask = torch.sigmoid(mask) + + weight = torch.randn(outC, inC, kH, kW) + weight.requires_grad = True + + bias = torch.rand(outC) + bias.requires_grad = True + + stride = 1 + padding = 1 + dilation = 1 + + print('check_gradient_dconv: ', + gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups), + 
eps=1e-3, atol=1e-4, rtol=1e-2)) + + +def check_pooling_zero_offset(): + + input = torch.randn(2, 16, 64, 64).zero_() + input[0, :, 16:26, 16:26] = 1. + input[1, :, 10:20, 20:30] = 2. + rois = torch.tensor([ + [0, 65, 65, 103, 103], + [1, 81, 41, 119, 79], + ]).float() + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0) + + out = pooling(input, rois, input.new()) + s = ', '.join(['%f' % out[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0) + offset = torch.randn(20, 2, 7, 7).zero_() + dout = dpooling(input, rois, offset) + s = ', '.join(['%f' % dout[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + +def check_gradient_dpooling(): + input = torch.randn(2, 3, 5, 5) * 0.01 + N = 4 + batch_inds = torch.randint(2, (N, 1)).float() + x = torch.rand((N, 1)).float() * 15 + y = torch.rand((N, 1)).float() * 15 + w = torch.rand((N, 1)).float() * 10 + h = torch.rand((N, 1)).float() * 10 + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(N, 2, 3, 3) + input.requires_grad = True + offset.requires_grad = True + + spatial_scale = 1.0 / 4 + pooled_size = 3 + output_dim = 3 + no_trans = 0 + group_size = 1 + trans_std = 0.0 + sample_per_part = 4 + part_size = pooled_size + + print('check_gradient_dpooling:', + gradcheck(dcn_v2_pooling, (input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std), + eps=1e-4)) + + +def example_dconv(): + input = torch.randn(2, 64, 128, 128) + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, + padding=1, deformable_groups=2) + # print(dcn.weight.shape, input.shape) + output = dcn(input) + targert = output.new(*output.size()) + targert.data.uniform_(-0.01, 0.01) + error = (targert - output).mean() + error.backward() + print(output.shape) + + +def example_dpooling(): + input = torch.randn(2, 32, 64, 64) + batch_inds = torch.randint(2, (20, 1)).float() + x = torch.randint(256, (20, 1)).float() + y = torch.randint(256, (20, 1)).float() + w = torch.randint(64, (20, 1)).float() + h = torch.randint(64, (20, 1)).float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(20, 2, 7, 7) + input.requires_grad = True + offset.requires_grad = True + + # normal roi_align + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1) + + # deformable pooling + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1) + + out = pooling(input, rois, offset) + dout = dpooling(input, rois, offset) + print(out.shape) + print(dout.shape) + + target_out = out.new(*out.size()) + target_out.data.uniform_(-0.01, 0.01) + target_dout = dout.new(*dout.size()) + target_dout.data.uniform_(-0.01, 0.01) + e = (target_out - out).mean() + e.backward() + e = (target_dout - dout).mean() + e.backward() + + +def example_mdpooling(): + input = torch.randn(2, 32, 64, 64) + input.requires_grad = True + batch_inds = torch.randint(2, (20, 1)).float() + x = torch.randint(256, (20, 1)).float() + y = torch.randint(256, (20, 1)).float() + w = torch.randint(64, (20, 1)).float() + h = torch.randint(64, (20, 1)).float() + rois = 
torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + + # mdformable pooling (V2) + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024) + + dout = dpooling(input, rois) + target = dout.new(*dout.size()) + target.data.uniform_(-0.1, 0.1) + error = (target - dout).mean() + error.backward() + print(dout.shape) + + +if __name__ == '__main__': + + example_dconv() + example_dpooling() + example_mdpooling() + + check_pooling_zero_offset() + # zero offset check + if inC == outC: + check_zero_offset() + + check_gradient_dpooling() + check_gradient_dconv() + # """ + # ****** Note: backward is not reentrant error may not be a serious problem, + # ****** since the max error is less than 1e-7, + # ****** Still looking for what trigger this problem + # """ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcuda.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcuda.py new file mode 100644 index 00000000..3bd5bd22 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/DCNv2_1.4/testcuda.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from dcn_v2 import dcn_v2_conv, DCNv2, DCN +from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling + +deformable_groups = 1 +N, inC, inH, inW = 2, 2, 4, 4 +outC = 2 +kH, kW = 3, 3 + + +def conv_identify(weight, bias): + weight.data.zero_() + bias.data.zero_() + o, i, h, w = weight.shape + y = h//2 + x = w//2 + for p in range(i): + for q in range(o): + if p == q: + weight.data[q, p, y, x] = 1.0 + + +def check_zero_offset(): + conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + dcn_v2 = DCNv2(inC, outC, (kH, kW), + stride=1, padding=1, dilation=1, + deformable_groups=deformable_groups).cuda() + + conv_offset.weight.data.zero_() + conv_offset.bias.data.zero_() + conv_mask.weight.data.zero_() + conv_mask.bias.data.zero_() + conv_identify(dcn_v2.weight, dcn_v2.bias) + + input = torch.randn(N, inC, inH, inW).cuda() + offset = conv_offset(input) + mask = conv_mask(input) + mask = torch.sigmoid(mask) + output = dcn_v2(input, offset, mask) + output *= 2 + d = (input - output).abs().max() + if d < 1e-10: + print('Zero offset passed') + else: + print('Zero offset failed') + print(input) + print(output) + +def check_gradient_dconv(): + + input = torch.rand(N, inC, inH, inW).cuda() * 0.01 + input.requires_grad = True + + offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2 + # offset.data.zero_() + # offset.data -= 0.5 + offset.requires_grad = True + + mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW).cuda() + # mask.data.zero_() + mask.requires_grad = True + mask = torch.sigmoid(mask) + + weight = torch.randn(outC, inC, kH, kW).cuda() + weight.requires_grad = True + + bias = torch.rand(outC).cuda() + bias.requires_grad = True + + stride = 1 + padding = 1 + dilation = 1 + + print('check_gradient_dconv: ', + gradcheck(dcn_v2_conv, (input, offset, 
mask, weight, bias, + stride, padding, dilation, deformable_groups), + eps=1e-3, atol=1e-4, rtol=1e-2)) + + +def check_pooling_zero_offset(): + + input = torch.randn(2, 16, 64, 64).cuda().zero_() + input[0, :, 16:26, 16:26] = 1. + input[1, :, 10:20, 20:30] = 2. + rois = torch.tensor([ + [0, 65, 65, 103, 103], + [1, 81, 41, 119, 79], + ]).cuda().float() + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0).cuda() + + out = pooling(input, rois, input.new()) + s = ', '.join(['%f' % out[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0).cuda() + offset = torch.randn(20, 2, 7, 7).cuda().zero_() + dout = dpooling(input, rois, offset) + s = ', '.join(['%f' % dout[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + +def check_gradient_dpooling(): + input = torch.randn(2, 3, 5, 5).cuda() * 0.01 + N = 4 + batch_inds = torch.randint(2, (N, 1)).cuda().float() + x = torch.rand((N, 1)).cuda().float() * 15 + y = torch.rand((N, 1)).cuda().float() * 15 + w = torch.rand((N, 1)).cuda().float() * 10 + h = torch.rand((N, 1)).cuda().float() * 10 + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(N, 2, 3, 3).cuda() + input.requires_grad = True + offset.requires_grad = True + + spatial_scale = 1.0 / 4 + pooled_size = 3 + output_dim = 3 + no_trans = 0 + group_size = 1 + trans_std = 0.0 + sample_per_part = 4 + part_size = pooled_size + + print('check_gradient_dpooling:', + gradcheck(dcn_v2_pooling, (input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std), + eps=1e-4)) + + +def example_dconv(): + input = torch.randn(2, 64, 128, 128).cuda() + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, + padding=1, deformable_groups=2).cuda() + # print(dcn.weight.shape, input.shape) + output = dcn(input) + targert = output.new(*output.size()) + targert.data.uniform_(-0.01, 0.01) + error = (targert - output).mean() + error.backward() + print(output.shape) + + +def example_dpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + batch_inds = torch.randint(2, (20, 1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(20, 2, 7, 7).cuda() + input.requires_grad = True + offset.requires_grad = True + + # normal roi_align + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1).cuda() + + # deformable pooling + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1).cuda() + + out = pooling(input, rois, offset) + dout = dpooling(input, rois, offset) + print(out.shape) + print(dout.shape) + + target_out = out.new(*out.size()) + target_out.data.uniform_(-0.01, 0.01) + target_dout = dout.new(*dout.size()) + target_dout.data.uniform_(-0.01, 0.01) + e = (target_out - out).mean() + e.backward() + e = (target_dout - dout).mean() + e.backward() + + +def example_mdpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + input.requires_grad = True + 
batch_inds = torch.randint(2, (20, 1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + + # mdformable pooling (V2) + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024).cuda() + + dout = dpooling(input, rois) + target = dout.new(*dout.size()) + target.data.uniform_(-0.1, 0.1) + error = (target - dout).mean() + error.backward() + print(dout.shape) + + +if __name__ == '__main__': + + example_dconv() + example_dpooling() + example_mdpooling() + + check_pooling_zero_offset() + # zero offset check + if inC == outC: + check_zero_offset() + + check_gradient_dpooling() + check_gradient_dconv() + # """ + # ****** Note: backward is not reentrant error may not be a serious problem, + # ****** since the max error is less than 1e-7, + # ****** Still looking for what trigger this problem + # """ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/__init__.py new file mode 100644 index 00000000..165e6372 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/__init__.py @@ -0,0 +1,13 @@ +from .functions.deform_conv import deform_conv, modulated_deform_conv +from .functions.deform_pool import deform_roi_pooling +from .modules.deform_conv import (DeformConv, ModulatedDeformConv, + DeformConvPack, ModulatedDeformConvPack) +from .modules.deform_pool import (DeformRoIPooling, DeformRoIPoolingPack, + ModulatedDeformRoIPoolingPack) + +__all__ = [ + 'DeformConv', 'DeformConvPack', 'ModulatedDeformConv', + 'ModulatedDeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', + 'ModulatedDeformRoIPoolingPack', 'deform_conv', 'modulated_deform_conv', + 'deform_roi_pooling' +] diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so new file mode 100755 index 00000000..a77a9e75 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so new file mode 100755 index 00000000..2e336ed9 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/lib.linux-x86_64-3.6/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda.o 
b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda.o new file mode 100644 index 00000000..aab12cbd Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda_kernel.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda_kernel.o new file mode 100644 index 00000000..1d23b025 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_conv_cuda_kernel.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda.o new file mode 100644 index 00000000..517d3e89 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda_kernel.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda_kernel.o new file mode 100644 index 00000000..591c2ad2 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.6/src/deform_pool_cuda_kernel.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/build.ninja new file mode 100644 index 00000000..b68dcc8c --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/build.ninja @@ -0,0 +1,29 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda/bin/nvcc + +cflags = -pthread -B /ssd8/exec/huangjy/miniconda3/envs/vgt/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /ssd8/exec/huangjy/miniconda3/envs/vgt/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include -fPIC -O2 -isystem /ssd8/exec/huangjy/miniconda3/envs/vgt/include -fPIC -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include/python3.9 -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 
-std=c++14 +cuda_cflags = -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/vgt/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/vgt/include/python3.9 -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + depfile = $out.d + deps = gcc + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/src/deform_conv_cuda.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/src/deform_conv_cuda.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-3.9/src/deform_conv_cuda_kernel.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/.ninja_log b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/.ninja_log new file mode 100644 index 00000000..53358401 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/.ninja_log @@ -0,0 +1,2 @@ +# ninja log v5 +0 7662 1710746646902240006 /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o 1473fba0a93f3aa9 diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/build.ninja b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/build.ninja new file mode 100644 index 00000000..08980c4b --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/build.ninja @@ -0,0 +1,27 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /usr/local/cuda/bin/nvcc + +cflags = -pthread -B /ssd8/exec/huangjy/miniconda3/envs/lore/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include 
-I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 +cuda_cflags = -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/TH -I/ssd8/exec/huangjy/miniconda3/envs/lore/lib/python3.7/site-packages/torch/include/THC -I/usr/local/cuda/include -I/ssd8/exec/huangjy/miniconda3/envs/lore/include/python3.7m -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_75,code=sm_75 -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda.o: compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/src/deform_conv_cuda.cpp +build /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o: cuda_compile /ssd8/exec/huangjy/AdvancedLiterateMachinery/DocumentUnderstanding/LORE-TSR/src/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu + + + + + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o new file mode 100644 index 00000000..bb82822a Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so new file mode 100755 index 00000000..a77a9e75 Binary files /dev/null and b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_conv_cuda.cpython-36m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so new file mode 100755 index 00000000..2e336ed9 Binary files /dev/null and 
b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/deform_pool_cuda.cpython-36m-x86_64-linux-gnu.so differ diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_conv.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_conv.py new file mode 100644 index 00000000..e82b0746 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_conv.py @@ -0,0 +1,187 @@ +import torch +from torch.autograd import Function +from torch.nn.modules.utils import _pair + +from .. import deform_conv_cuda + + +class DeformConvFunction(Function): + + @staticmethod + def forward(ctx, + input, + offset, + weight, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + im2col_step=64): + if input is not None and input.dim() != 4: + raise ValueError( + "Expected 4D tensor as input, got {}D tensor instead.".format( + input.dim())) + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.im2col_step = im2col_step + + ctx.save_for_backward(input, offset, weight) + + output = input.new_empty( + DeformConvFunction._output_size(input, weight, ctx.padding, + ctx.dilation, ctx.stride)) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + if not input.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + deform_conv_cuda.deform_conv_forward_cuda( + input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], + weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, + cur_im2col_step) + return output + + @staticmethod + def backward(ctx, grad_output): + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + if not grad_output.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + deform_conv_cuda.deform_conv_backward_input_cuda( + input, offset, grad_output, grad_input, + grad_offset, weight, ctx.bufs_[0], weight.size(3), + weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, + cur_im2col_step) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + deform_conv_cuda.deform_conv_backward_parameters_cuda( + input, offset, grad_output, + grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), + weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, + cur_im2col_step) + + return (grad_input, 
grad_offset, grad_weight, None, None, None, None, + None) + + @staticmethod + def _output_size(input, weight, padding, dilation, stride): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = padding[d] + kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + "convolution input is too small (output would be {})".format( + 'x'.join(map(str, output_size)))) + return output_size + + +class ModulatedDeformConvFunction(Function): + + @staticmethod + def symbolic(g, input, offset, mask, weight, bias,stride,padding,dilation,groups,deformable_groups): + return g.op("DCNv2", input, offset, mask, weight, bias, + stride_i = stride,padding_i = padding,dilation_i = dilation, + groups_i = groups,deformable_group_i = deformable_groups) + + @staticmethod + def forward(ctx, + input, + offset, + mask, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1): + ctx.stride = stride + ctx.padding = padding + ctx.dilation = dilation + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.with_bias = bias is not None + if not ctx.with_bias: + bias = input.new_empty(1) # fake tensor + if not input.is_cuda: + raise NotImplementedError + if weight.requires_grad or mask.requires_grad or offset.requires_grad \ + or input.requires_grad: + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty( + ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + deform_conv_cuda.modulated_deform_conv_cuda_forward( + input, weight, bias, ctx._bufs[0], offset, mask, output, + ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + return output + + @staticmethod + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + deform_conv_cuda.modulated_deform_conv_cuda_backward( + input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], + grad_input, grad_weight, grad_bias, grad_offset, grad_mask, + grad_output, weight.shape[2], weight.shape[3], ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + if not ctx.with_bias: + grad_bias = None + + return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, + None, None, None, None, None) + + @staticmethod + def _infer_shape(ctx, input, weight): + n = input.size(0) + channels_out = weight.size(0) + height, width = input.shape[2:4] + kernel_h, kernel_w = weight.shape[2:4] + height_out = (height + 2 * ctx.padding - + (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 + width_out = (width + 2 * ctx.padding - + (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 + return n, channels_out, height_out, width_out + + +deform_conv = DeformConvFunction.apply +modulated_deform_conv = ModulatedDeformConvFunction.apply diff --git 
a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_pool.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_pool.py new file mode 100644 index 00000000..65ff0efb --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/functions/deform_pool.py @@ -0,0 +1,69 @@ +import torch +from torch.autograd import Function + +from .. import deform_pool_cuda + + +class DeformRoIPoolingFunction(Function): + + @staticmethod + def forward(ctx, + data, + rois, + offset, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + ctx.spatial_scale = spatial_scale + ctx.out_size = out_size + ctx.out_channels = out_channels + ctx.no_trans = no_trans + ctx.group_size = group_size + ctx.part_size = out_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + assert 0.0 <= ctx.trans_std <= 1.0 + if not data.is_cuda: + raise NotImplementedError + + n = rois.shape[0] + output = data.new_empty(n, out_channels, out_size, out_size) + output_count = data.new_empty(n, out_channels, out_size, out_size) + deform_pool_cuda.deform_psroi_pooling_cuda_forward( + data, rois, offset, output, output_count, ctx.no_trans, + ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, + ctx.part_size, ctx.sample_per_part, ctx.trans_std) + + if data.requires_grad or rois.requires_grad or offset.requires_grad: + ctx.save_for_backward(data, rois, offset) + ctx.output_count = output_count + + return output + + @staticmethod + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + + data, rois, offset = ctx.saved_tensors + output_count = ctx.output_count + grad_input = torch.zeros_like(data) + grad_rois = None + grad_offset = torch.zeros_like(offset) + + deform_pool_cuda.deform_psroi_pooling_cuda_backward( + grad_output, data, rois, offset, output_count, grad_input, + grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, + ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, + ctx.trans_std) + return (grad_input, grad_rois, grad_offset, None, None, None, None, + None, None, None, None) + + +deform_roi_pooling = DeformRoIPoolingFunction.apply diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/__init__.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_conv.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_conv.py new file mode 100644 index 00000000..c0b8286d --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_conv.py @@ -0,0 +1,158 @@ +import math + +import torch +import torch.nn as nn +from torch.nn.modules.utils import _pair + +from ..functions.deform_conv import deform_conv, modulated_deform_conv + + +class DeformConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=False): + super(DeformConv, self).__init__() + + assert not bias + assert 
in_channels % groups == 0, \ + 'in_channels {} cannot be divisible by groups {}'.format( + in_channels, groups) + assert out_channels % groups == 0, \ + 'out_channels {} cannot be divisible by groups {}'.format( + out_channels, groups) + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. / math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + + def forward(self, x, offset): + return deform_conv(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deformable_groups) + + +class DeformConvPack(DeformConv): + + def __init__(self, *args, **kwargs): + super(DeformConvPack, self).__init__(*args, **kwargs) + + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deformable_groups * 2 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x): + offset = self.conv_offset(x) + return deform_conv(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deformable_groups) + + +class ModulatedDeformConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=True): + super(ModulatedDeformConv, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.deformable_groups = deformable_groups + self.with_bias = bias + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // groups, + *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.zero_() + + def forward(self, x, offset, mask): + return modulated_deform_conv(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, self.dilation, + self.groups, self.deformable_groups) + + +class ModulatedDeformConvPack(ModulatedDeformConv): + + def __init__(self, *args, **kwargs): + super(ModulatedDeformConvPack, self).__init__(*args, **kwargs) + + self.conv_offset_mask = nn.Conv2d( + self.in_channels, + self.deformable_groups * 3 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, x): + + out = self.conv_offset_mask(x) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return modulated_deform_conv(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, self.dilation, + self.groups, self.deformable_groups) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_pool.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_pool.py new file mode 100644 index 00000000..5e019675 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/modules/deform_pool.py @@ -0,0 +1,172 @@ +from torch import nn + +from ..functions.deform_pool import deform_roi_pooling + + +class DeformRoIPooling(nn.Module): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DeformRoIPooling, self).__init__() + self.spatial_scale = spatial_scale + self.out_size = out_size + self.out_channels = out_channels + self.no_trans = no_trans + self.group_size = group_size + self.part_size = out_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, data, rois, offset): + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + + +class DeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + num_offset_fcs=3, + deform_fc_channels=1024): + super(DeformRoIPoolingPack, + self).__init__(spatial_scale, out_size, out_channels, no_trans, + group_size, part_size, sample_per_part, trans_std) + + self.num_offset_fcs = num_offset_fcs + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + seq = [] + ic = self.out_size * self.out_size * self.out_channels + for i in range(self.num_offset_fcs): + if i < self.num_offset_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size * self.out_size * 2 + seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_offset_fcs - 1: + seq.append(nn.ReLU(inplace=True)) + self.offset_fc = nn.Sequential(*seq) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if 
self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size, self.out_size) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + + +class ModulatedDeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + num_offset_fcs=3, + num_mask_fcs=2, + deform_fc_channels=1024): + super(ModulatedDeformRoIPoolingPack, self).__init__( + spatial_scale, out_size, out_channels, no_trans, group_size, + part_size, sample_per_part, trans_std) + + self.num_offset_fcs = num_offset_fcs + self.num_mask_fcs = num_mask_fcs + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + offset_fc_seq = [] + ic = self.out_size * self.out_size * self.out_channels + for i in range(self.num_offset_fcs): + if i < self.num_offset_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size * self.out_size * 2 + offset_fc_seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_offset_fcs - 1: + offset_fc_seq.append(nn.ReLU(inplace=True)) + self.offset_fc = nn.Sequential(*offset_fc_seq) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + mask_fc_seq = [] + ic = self.out_size * self.out_size * self.out_channels + for i in range(self.num_mask_fcs): + if i < self.num_mask_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size * self.out_size + mask_fc_seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_mask_fcs - 1: + mask_fc_seq.append(nn.ReLU(inplace=True)) + else: + mask_fc_seq.append(nn.Sigmoid()) + self.mask_fc = nn.Sequential(*mask_fc_seq) + self.mask_fc[-2].weight.data.zero_() + self.mask_fc[-2].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size, self.out_size) + mask = self.mask_fc(x.view(n, -1)) + mask = mask.view(n, 1, self.out_size, self.out_size) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) * mask diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/setup.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/setup.py new file mode 100644 
index 00000000..96380181 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/setup.py @@ -0,0 +1,15 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='deform_conv', + ext_modules=[ + CUDAExtension('deform_conv_cuda', [ + 'src/deform_conv_cuda.cpp', + 'src/deform_conv_cuda_kernel.cu', + ]), + CUDAExtension( + 'deform_pool_cuda', + ['src/deform_pool_cuda.cpp', 'src/deform_pool_cuda_kernel.cu']), + ], + cmdclass={'build_ext': BuildExtension}) diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda.cpp new file mode 100644 index 00000000..c4563ed8 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda.cpp @@ -0,0 +1,695 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c + +#include + +#include +#include + +void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor data_col); + +void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im); + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const int channels, const int height, + const int width, const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor grad_offset); + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor data_col); + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor grad_im); + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, + 
const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, at::Tensor grad_offset, + at::Tensor grad_mask); + +void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, + at::Tensor weight, int kH, int kW, int dH, int dW, int padH, + int padW, int dilationH, int dilationW, int group, + int deformable_group) { + AT_CHECK(weight.ndimension() == 4, + "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s", + weight.ndimension()); + + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + AT_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, + kW); + + AT_CHECK((weight.size(2) == kH && weight.size(3) == kW), + "kernel size should be consistent with weight, ", + "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, + kW, weight.size(2), weight.size(3)); + + AT_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + AT_CHECK( + dilationW > 0 && dilationH > 0, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input.ndimension(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", + ndim); + + long nInputPlane = weight.size(1) * group; + long inputHeight = input.size(dimh); + long inputWidth = input.size(dimw); + long nOutputPlane = weight.size(0); + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + AT_CHECK(nInputPlane % deformable_group == 0, + "input channels must divide deformable group size"); + + if (outputWidth < 1 || outputHeight < 1) + AT_ERROR( + "Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). 
Output size is too small", + nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, + outputWidth); + + AT_CHECK(input.size(1) == nInputPlane, + "invalid number of input planes, expected: %d, but got: %d", + nInputPlane, input.size(1)); + + AT_CHECK((inputHeight >= kH && inputWidth >= kW), + "input image is smaller than kernel"); + + AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), + "invalid spatial size of offset, expected height: %d width: %d, but " + "got height: %d width: %d", + outputHeight, outputWidth, offset.size(2), offset.size(3)); + + AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), + "invalid number of channels of offset"); + + if (gradOutput != NULL) { + AT_CHECK(gradOutput->size(dimf) == nOutputPlane, + "invalid number of gradOutput planes, expected: %d, but got: %d", + nOutputPlane, gradOutput->size(dimf)); + + AT_CHECK((gradOutput->size(dimh) == outputHeight && + gradOutput->size(dimw) == outputWidth), + "invalid size of gradOutput, expected height: %d width: %d , but " + "got height: %d width: %d", + outputHeight, outputWidth, gradOutput->size(dimh), + gradOutput->size(dimw)); + } +} + +int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, + at::Tensor offset, at::Tensor output, + at::Tensor columns, at::Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step) { + // todo: resize columns to include im2col: done + // todo: add im2col_step as input + // todo: add new output buffer and transpose it to output (or directly + // transpose output) todo: possibly change data indexing because of + // parallel_imgs + + shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input.unsqueeze_(0); + offset.unsqueeze_(0); + } + + // todo: assert batchsize dividable by im2col_step + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, + outputHeight, outputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < outputHeight * outputWidth) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + at::Tensor output_buffer = + at::zeros({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}, + output.options()); + + output_buffer = output_buffer.view( + {output_buffer.size(0), group, output_buffer.size(1) / group, + output_buffer.size(2), output_buffer.size(3)}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + 
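// For each im2col_step-sized slice of the batch: deformable_im2col() gathers + // the offset-sampled input patches into 'columns', and the per-group addmm_ + // calls below multiply them by the weights, accumulating into output_buffer. +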
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + output_buffer[elt][g] = output_buffer[elt][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output_buffer[elt][g]); + } + } + + output_buffer = output_buffer.view( + {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), + output_buffer.size(3), output_buffer.size(4)}); + + output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step, outputHeight, outputWidth}); + output_buffer.transpose_(1, 2); + output.copy_(output_buffer); + output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + output = output.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, + at::Tensor gradOutput, at::Tensor gradInput, + at::Tensor gradOffset, at::Tensor weight, + at::Tensor columns, int kW, int kH, int dW, + int dH, int padW, int padH, int dilationW, + int dilationH, int group, + int deformable_group, int im2col_step) { + shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view({1, input.size(0), input.size(1), input.size(2)}); + offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + // change order of grad output + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, + outputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, 
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + // divide into groups + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), group, gradOutput.size(1) / group, + gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); + + for (int g = 0; g < group; g++) { + columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + gradOutput[elt][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), + gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); + + deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, + inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, im2col_step, deformable_group, + gradOffset[elt]); + + deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, gradInput[elt]); + } + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + gradOffset = gradOffset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + gradOffset = + gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_parameters_cuda( + at::Tensor input, at::Tensor offset, at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, + int padW, int padH, int dilationW, int dilationH, int group, + int deformable_group, float scale, int im2col_step) { + // todo: transpose and reshape outGrad + // todo: reshape columns + // todo: add im2col_step as input + + shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, + padW, dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view( + at::IntList({1, input.size(0), input.size(1), input.size(2)})); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = gradWeight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), "invalid 
batch size of offset"); + + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, + outputHeight, outputWidth}); + gradOutputBuffer.copy_(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}); + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + // divide into group + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, + gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + gradWeight = + gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3)}); + + for (int g = 0; g < group; g++) { + gradWeight[g] = gradWeight[g] + .flatten(1) + .addmm_(gradOutputBuffer[elt][g].flatten(1), + columns[g].transpose(1, 0), 1.0, scale) + .view_as(gradWeight[g]); + } + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), + gradOutputBuffer.size(1) * gradOutputBuffer.size(2), + gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3), + gradWeight.size(4)}); + } + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + } + + return 1; +} + +void modulated_deform_conv_cuda_forward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, + int kernel_h, int kernel_w, const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, + const int dilation_w, const int group, const int deformable_group, + const bool with_bias) { + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + 
AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + // resize output + output = output.view({batch, channels_out, height_out, width_out}).zero_(); + // resize temporary columns + columns = + at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, + input.options()); + + output = output.view({output.size(0), group, output.size(1) / group, + output.size(2), output.size(3)}); + + for (int b = 0; b < batch; b++) { + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + // divide into group + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + + for (int g = 0; g < group; g++) { + output[b][g] = output[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output[b][g]); + } + + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + output = output.view({output.size(0), output.size(1) * output.size(2), + output.size(3), output.size(4)}); + + if (with_bias) { + output += bias.view({1, bias.size(0), 1, 1}); + } +} + +void modulated_deform_conv_cuda_backward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor columns, + at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, + at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, + int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, + int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, + const bool with_bias) { + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * 
width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + grad_input = grad_input.view({batch, channels, height, width}); + columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, + input.options()); + + grad_output = + grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, + grad_output.size(2), grad_output.size(3)}); + + for (int b = 0; b < batch; b++) { + // divide int group + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + grad_output[b][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda( + columns, input[b], offset[b], mask[b], 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], + grad_mask[b]); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda( + columns, offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, grad_input[b]); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and + // group + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + grad_weight = grad_weight.view({group, grad_weight.size(0) / group, + grad_weight.size(1), grad_weight.size(2), + grad_weight.size(3)}); + if (with_bias) + grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); + + for (int g = 0; g < group; g++) { + grad_weight[g] = + grad_weight[g] + .flatten(1) + .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) + .view_as(grad_weight[g]); + if (with_bias) { + grad_bias[g] = + grad_bias[g] + .view({-1, 1}) + .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) + .view(-1); + } + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), + grad_weight.size(2), grad_weight.size(3), + grad_weight.size(4)}); + if (with_bias) + grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); + } + grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), + grad_output.size(2), grad_output.size(3), + grad_output.size(4)}); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_conv_forward_cuda", &deform_conv_forward_cuda, + "deform forward (CUDA)"); + m.def("deform_conv_backward_input_cuda", &deform_conv_backward_input_cuda, + "deform_conv_backward_input (CUDA)"); + m.def("deform_conv_backward_parameters_cuda", + &deform_conv_backward_parameters_cuda, + "deform_conv_backward_parameters (CUDA)"); + m.def("modulated_deform_conv_cuda_forward", + &modulated_deform_conv_cuda_forward, + "modulated deform conv forward (CUDA)"); + 
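// These bindings are compiled by setup.py into the 'deform_conv_cuda' extension + // module; the exported names must match the calls in ../functions/deform_conv.py. +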
m.def("modulated_deform_conv_cuda_backward", + &modulated_deform_conv_cuda_backward, + "modulated deform conv backward (CUDA)"); +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu new file mode 100644 index 00000000..fd560163 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_conv_cuda_kernel.cu @@ -0,0 +1,866 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +const int kMaxGridNum = 65535; + +inline int GET_BLOCKS(const int N) +{ + return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); +} + +template +__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += 
(argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const scalar_t map_h = i * dilation_h + offset_h; + //const scalar_t map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +void deformable_im2col( + const at::Tensor data_im, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const 
int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + // todo: check parallel_imgs is correctly passed in + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.type(), "deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *data_col_ = data_col.data(); + + deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, parallel_imgs, channels, deformable_group, + height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, const scalar_t *data_col, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = 
get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +void deformable_col2im( + const at::Tensor data_col, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im) +{ + + // todo: make sure parallel_imgs is passed in correctly + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_im_ = grad_im.data(); + + deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, + ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, + const scalar_t *data_im, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, scalar_t *grad_offset) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos 
% width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + const scalar_t weight = get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) +{ + + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; + int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_offset_ = grad_offset.data(); + + deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, channels, height, width, + ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_); + })); +} + +template +__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty 
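    // The sampling point falls outside the feature map, so it contributes nothing.
    // For in-range points, the value returned below is the bilinear coefficient of the
    // integer location (h, w) with respect to (argmax_h, argmax_w): e.g. for
    // (h, w) == (argmax_h_low, argmax_w_low) it equals (1 - dh) * (1 - dw), where
    // dh = argmax_h - argmax_h_low and dw = argmax_w - argmax_w_low.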
+ return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + scalar_t *data_col_ptr = data_col + 
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + //data_col_ptr += height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + 
h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_im, + const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_offset, scalar_t *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * 
(i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const scalar_t weight = dmcn_get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.type(), "modulated_deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *data_col_ = data_col.data(); + + modulated_deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor grad_im) +{ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "modulated_deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = 
data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_im_ = grad_im.data(); + + modulated_deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + at::Tensor grad_offset, at::Tensor grad_mask) +{ + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_offset_ = grad_offset.data(); + scalar_t *grad_mask_ = grad_mask.data(); + + modulated_deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset_, grad_mask_); + })); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda.cpp b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda.cpp new file mode 100644 index 00000000..803d5f14 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda.cpp @@ -0,0 +1,87 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c + +// based on +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +#include + +#include +#include + +void DeformablePSROIPoolForward( + const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, + at::Tensor out, at::Tensor top_count, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void 
DeformablePSROIPoolBackwardAcc( + const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, + const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, + at::Tensor trans_grad, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void deform_psroi_pooling_cuda_forward( + at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, + at::Tensor top_count, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std) { + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out.size(0), num_bbox); + + DeformablePSROIPoolForward( + input, bbox, trans, out, top_count, batch, channels, height, width, + num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, + pooled_size, part_size, sample_per_part, trans_std); +} + +void deform_psroi_pooling_cuda_backward( + at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, + at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, + const int no_trans, const float spatial_scale, const int output_dim, + const int group_size, const int pooled_size, const int part_size, + const int sample_per_part, const float trans_std) { + AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out_grad.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out_grad.size(0), num_bbox); + + DeformablePSROIPoolBackwardAcc( + out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, + channels, height, width, num_bbox, channels_trans, no_trans, + spatial_scale, output_dim, group_size, pooled_size, part_size, + sample_per_part, trans_std); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward, + "deform psroi pooling forward(CUDA)"); + m.def("deform_psroi_pooling_cuda_backward", + &deform_psroi_pooling_cuda_backward, + "deform psroi pooling backward(CUDA)"); +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda_kernel.cu b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda_kernel.cu new file mode 100644 index 00000000..e4944600 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dcn/src/deform_pool_cuda_kernel.cu @@ -0,0 +1,364 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ +// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ scalar_t bilinear_interp( + const scalar_t *data, + const scalar_t x, + const scalar_t y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + scalar_t dist_x = (scalar_t)(x - x1); + scalar_t dist_y = (scalar_t)(y - y1); + scalar_t value11 = data[y1 * width + x1]; + scalar_t value12 = data[y2 * width + x1]; + scalar_t value21 = data[y1 * width + x2]; + scalar_t value22 = data[y2 * width + x2]; + scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, + const scalar_t *bottom_data, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const scalar_t *bottom_rois, const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + scalar_t *top_data, + scalar_t *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? 
(scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + scalar_t sum = 0; + int count = 0; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? (scalar_t)(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const scalar_t *top_diff, + const scalar_t *top_count, + const int num_rois, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff, + const scalar_t *bottom_data, + const scalar_t *bottom_rois, + const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + scalar_t diff_val = top_diff[index] / top_count[index]; + const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + scalar_t dist_x = w - x0, dist_y = h - y0; + scalar_t q00 = (1 - dist_x) * (1 - dist_y); + scalar_t q01 = (1 - dist_x) * dist_y; + scalar_t q10 = dist_x * (1 - dist_y); + scalar_t q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + 
part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +void DeformablePSROIPoolForward(const at::Tensor data, + const at::Tensor bbox, + const at::Tensor trans, + at::Tensor out, + at::Tensor top_count, + const int batch, + const int channels, + const int height, + const int width, + const int num_bbox, + const int channels_trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data.type(), "deformable_psroi_pool_forward", ([&] { + const scalar_t *bottom_data = data.data(); + const scalar_t *bottom_rois = bbox.data(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); + scalar_t *top_data = out.data(); + scalar_t *top_count_data = top_count.data(); + + DeformablePSROIPoolForwardKernel<<>>( + count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, + bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim, + group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); + } +} + +void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, + const at::Tensor data, + const at::Tensor bbox, + const at::Tensor trans, + const at::Tensor top_count, + at::Tensor in_grad, + at::Tensor trans_grad, + const int batch, + const int channels, + const int height, + const int width, + const int num_bbox, + const int channels_trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + // LOG(INFO) << "DeformablePSROIPoolBackward"; + const int num_rois = num_bbox; + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] { + const scalar_t *top_diff = out_grad.data(); + const scalar_t *bottom_data = data.data(); + const scalar_t *bottom_rois = bbox.data(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); + scalar_t *bottom_data_diff = in_grad.data(); + scalar_t *bottom_trans_diff = no_trans ? 
NULL : trans_grad.data(); + const scalar_t *top_count_data = top_count.data(); + + DeformablePSROIPoolBackwardAccKernel<<>>( + count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width, + pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, + bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, + group_size, part_size, num_classes, channels_each_class); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dlav0.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dlav0.py new file mode 100644 index 00000000..92fdbcf8 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/dlav0.py @@ -0,0 +1,647 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +from os.path import join + +import torch +from torch import nn +import torch.utils.model_zoo as model_zoo + +import numpy as np + +BatchNorm = nn.BatchNorm2d + +def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'): + return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)) + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn1 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = BatchNorm(planes) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = Bottleneck.expansion + bottle_planes = planes // expansion + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class BottleneckX(nn.Module): + expansion = 2 + cardinality = 32 + + def 
__init__(self, inplanes, planes, stride=1, dilation=1): + super(BottleneckX, self).__init__() + cardinality = BottleneckX.cardinality + # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) + # bottle_planes = dim * cardinality + bottle_planes = planes * cardinality // 32 + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, bias=False, + dilation=dilation, groups=cardinality) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class Root(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(Root, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, + stride=1, bias=False, padding=(kernel_size - 1) // 2) + self.bn = BatchNorm(out_channels) + self.relu = nn.ReLU(inplace=True) + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(torch.cat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(nn.Module): + def __init__(self, levels, block, in_channels, out_channels, stride=1, + level_root=False, root_dim=0, root_kernel_size=1, + dilation=1, root_residual=False): + super(Tree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, + dilation=dilation) + self.tree2 = block(out_channels, out_channels, 1, + dilation=dilation) + else: + self.tree1 = Tree(levels - 1, block, in_channels, out_channels, + stride, root_dim=0, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + self.tree2 = Tree(levels - 1, block, out_channels, out_channels, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + if levels == 1: + self.root = Root(root_dim, out_channels, root_kernel_size, + root_residual) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2d(in_channels, out_channels, + kernel_size=1, stride=1, bias=False), + BatchNorm(out_channels) + ) + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + residual = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Module): + def __init__(self, levels, channels, num_classes=1000, + block=BasicBlock, residual_root=False, return_levels=False, + pool_size=7, 
linear_root=False): + super(DLA, self).__init__() + self.channels = channels + self.return_levels = return_levels + self.num_classes = num_classes + self.base_layer = nn.Sequential( + nn.Conv2d(3, channels[0], kernel_size=7, stride=1, + padding=3, bias=False), + BatchNorm(channels[0]), + nn.ReLU(inplace=True)) + self.level0 = self._make_conv_level( + channels[0], channels[0], levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, + level_root=False, + root_residual=residual_root) + self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, + level_root=True, root_residual=residual_root) + self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, + level_root=True, root_residual=residual_root) + self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, + level_root=True, root_residual=residual_root) + + self.avgpool = nn.AvgPool2d(pool_size) + self.fc = nn.Conv2d(channels[-1], num_classes, kernel_size=1, + stride=1, padding=0, bias=True) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_level(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + nn.MaxPool2d(stride, stride=stride), + nn.Conv2d(inplanes, planes, + kernel_size=1, stride=1, bias=False), + BatchNorm(planes), + ) + + layers = [] + layers.append(block(inplanes, planes, stride, downsample=downsample)) + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, bias=False, dilation=dilation), + BatchNorm(planes), + nn.ReLU(inplace=True)]) + inplanes = planes + return nn.Sequential(*modules) + + def forward(self, x): + y = [] + x = self.base_layer(x) + for i in range(6): + x = getattr(self, 'level{}'.format(i))(x) + y.append(x) + if self.return_levels: + return y + else: + x = self.avgpool(x) + x = self.fc(x) + x = x.view(x.size(0), -1) + + return x + + def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'): + fc = self.fc + if name.endswith('.pth'): + model_weights = torch.load(data + name) + else: + model_url = get_model_url(data, name, hash) + model_weights = model_zoo.load_url(model_url) + num_classes = len(model_weights[list(model_weights.keys())[-1]]) + self.fc = nn.Conv2d( + self.channels[-1], num_classes, + kernel_size=1, stride=1, padding=0, bias=True) + self.load_state_dict(model_weights) + self.fc = fc + + +def dla34(pretrained, **kwargs): # DLA-34 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 128, 256, 512], + block=BasicBlock, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86') + return model + + +def dla46_c(pretrained=None, **kwargs): # DLA-46-C + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 64, 128, 256], + block=Bottleneck, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla46_c') + return model + + +def dla46x_c(pretrained=None, **kwargs): # DLA-X-46-C + 
BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 64, 128, 256], + block=BottleneckX, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla46x_c') + return model + + +def dla60x_c(pretrained, **kwargs): # DLA-X-60-C + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 64, 64, 128, 256], + block=BottleneckX, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla60x_c', hash='b870c45c') + return model + + +def dla60(pretrained=None, **kwargs): # DLA-60 + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 128, 256, 512, 1024], + block=Bottleneck, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla60') + return model + + +def dla60x(pretrained=None, **kwargs): # DLA-X-60 + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 128, 256, 512, 1024], + block=BottleneckX, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla60x') + return model + + +def dla102(pretrained=None, **kwargs): # DLA-102 + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=Bottleneck, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla102') + return model + + +def dla102x(pretrained=None, **kwargs): # DLA-X-102 + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=BottleneckX, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla102x') + return model + + +def dla102x2(pretrained=None, **kwargs): # DLA-X-102 64 + BottleneckX.cardinality = 64 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=BottleneckX, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla102x2') + return model + + +def dla169(pretrained=None, **kwargs): # DLA-169 + Bottleneck.expansion = 2 + model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024], + block=Bottleneck, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla169') + return model + + +def set_bn(bn): + global BatchNorm + BatchNorm = bn + dla.BatchNorm = bn + + +class Identity(nn.Module): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + + +class IDAUp(nn.Module): + def __init__(self, node_kernel, out_dim, channels, up_factors): + super(IDAUp, self).__init__() + self.channels = channels + self.out_dim = out_dim + for i, c in enumerate(channels): + if c == out_dim: + proj = Identity() + else: + proj = nn.Sequential( + nn.Conv2d(c, out_dim, + kernel_size=1, stride=1, bias=False), + BatchNorm(out_dim), + nn.ReLU(inplace=True)) + f = int(up_factors[i]) + if f == 1: + up = Identity() + else: + up = nn.ConvTranspose2d( + out_dim, out_dim, f * 2, stride=f, padding=f // 2, + output_padding=0, groups=out_dim, bias=False) + fill_up_weights(up) + setattr(self, 'proj_' + str(i), proj) + setattr(self, 'up_' + str(i), up) + + for i in range(1, len(channels)): + node = nn.Sequential( + nn.Conv2d(out_dim * 2, out_dim, + kernel_size=node_kernel, stride=1, + padding=node_kernel // 2, bias=False), + BatchNorm(out_dim), + nn.ReLU(inplace=True)) + setattr(self, 'node_' + str(i), node) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def forward(self, layers): + assert len(self.channels) == len(layers), \ + '{} vs {} layers'.format(len(self.channels), len(layers)) + layers = list(layers) + for i, l in enumerate(layers): + upsample = getattr(self, 'up_' + str(i)) + project = getattr(self, 'proj_' + str(i)) + layers[i] = upsample(project(l)) + x = layers[0] + y = [] + for i in range(1, len(layers)): + node = getattr(self, 'node_' + str(i)) + x = node(torch.cat([x, layers[i]], 1)) + y.append(x) + return x, y + + +class DLAUp(nn.Module): + def __init__(self, channels, scales=(1, 2, 4, 8, 16), in_channels=None): + super(DLAUp, self).__init__() + if in_channels is None: + in_channels = channels + self.channels = channels + channels = list(channels) + scales = np.array(scales, dtype=int) + for i in range(len(channels) - 1): + j = -i - 2 + setattr(self, 'ida_{}'.format(i), + IDAUp(3, channels[j], in_channels[j:], + scales[j:] // scales[j])) + scales[j + 1:] = scales[j] + in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] + + def forward(self, layers): + layers = list(layers) + assert len(layers) > 1 + for i in range(len(layers) - 1): + ida = getattr(self, 'ida_{}'.format(i)) + x, y = ida(layers[-i - 2:]) + layers[-i - 1:] = y + return x + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + # torch.nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + # torch.nn.init.xavier_normal_(m.weight.data) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + +class DLASeg(nn.Module): + def __init__(self, base_name, heads, + pretrained=True, down_ratio=4, head_conv=256): + super(DLASeg, self).__init__() + assert down_ratio in [2, 4, 8, 16] + self.heads = heads + self.first_level = int(np.log2(down_ratio)) + self.base = globals()[base_name]( + pretrained=pretrained, return_levels=True) + channels = self.base.channels + scales = [2 ** i for i in range(len(channels[self.first_level:]))] + self.dla_up = DLAUp(channels[self.first_level:], scales=scales) + ''' + self.fc = nn.Sequential( + nn.Conv2d(channels[self.first_level], classes, kernel_size=1, + stride=1, 
padding=0, bias=True) + ) + ''' + + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(channels[self.first_level], head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=1, stride=1, + padding=0, bias=True)) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(channels[self.first_level], classes, + kernel_size=1, stride=1, + padding=0, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + ''' + up_factor = 2 ** self.first_level + if up_factor > 1: + up = nn.ConvTranspose2d(classes, classes, up_factor * 2, + stride=up_factor, padding=up_factor // 2, + output_padding=0, groups=classes, + bias=False) + fill_up_weights(up) + up.weight.requires_grad = False + else: + up = Identity() + self.up = up + self.softmax = nn.LogSoftmax(dim=1) + + + for m in self.fc.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + ''' + + def forward(self, x): + x = self.base(x) + x = self.dla_up(x[self.first_level:]) + # x = self.fc(x) + # y = self.softmax(self.up(x)) + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x) + return [ret] + + ''' + def optim_parameters(self, memo=None): + for param in self.base.parameters(): + yield param + for param in self.dla_up.parameters(): + yield param + for param in self.fc.parameters(): + yield param + ''' +''' +def dla34up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla34', classes, pretrained_base=pretrained_base, **kwargs) + return model + + +def dla60up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla60', classes, pretrained_base=pretrained_base, **kwargs) + return model + + +def dla102up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla102', classes, + pretrained_base=pretrained_base, **kwargs) + return model + + +def dla169up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla169', classes, + pretrained_base=pretrained_base, **kwargs) + return model +''' + +def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4): + model = DLASeg('dla{}'.format(num_layers), heads, + pretrained=True, + down_ratio=down_ratio, + head_conv=head_conv) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet.py new file mode 100644 index 00000000..c2ed2588 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet.py @@ -0,0 +1,363 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License.
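The DLA builders above all share one head convention: heads maps a head name to its number of output channels, head_conv sets the width of the intermediate 3x3 convolution, and any head whose name contains 'hm' has its final bias filled with -2.19 (roughly log(0.1/0.9)) so the heatmap starts near a small foreground prior. A minimal construction sketch for the DLASeg defined above; the head names, channel counts and input size are illustrative assumptions, not taken from this diff, and pretrained=True downloads the DLA-34 ImageNet weights from dl.yf.io.

import torch

# Hypothetical head layout: 2 heatmap channels, 8 corner-offset channels, 2 center offsets.
heads = {'hm': 2, 'wh': 8, 'reg': 2}
model = DLASeg('dla34', heads, pretrained=True, down_ratio=4, head_conv=256)
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 512, 512))[0]  # dict of per-head maps at 1/4 input resolution
print({k: tuple(v.shape) for k, v in out.items()})  # e.g. 'hm' -> (1, 2, 128, 128)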
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=0, bias=False) + +def pad_same(x,ksize,stride=1,Pool=False): + shape = x.shape + n_in,w,h = shape[1],shape[2],shape[3] + if h % stride == 0: + pad_along_height = max(ksize - stride, 0) + else: + pad_along_height = max(ksize - (h % stride), 0) + if w % stride == 0: + pad_along_width = max(ksize - stride, 0) + else: + pad_along_width = max(ksize - (w % stride), 0) + pad_bottom = pad_along_height // 2 + pad_top = pad_along_height - pad_bottom + pad_right = pad_along_width // 2 + pad_left = pad_along_width - pad_right + dim = (pad_left,pad_right,pad_top,pad_bottom) + if Pool: + dim = (pad_right,pad_left,pad_bottom,pad_top) + x = F.pad(x,dim,"constant",value=0) + return x + +class ChannelAttention(nn.Module): + def __init__(self, in_planes, ratio=16): + super(ChannelAttention, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False) + self.relu1 = nn.ReLU() + self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x)))) + max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x)))) + + out = avg_out + max_out + + return self.sigmoid(out) + +class SpatialAttention(nn.Module): + def __init__(self): + super(SpatialAttention, self).__init__() + + self.conv1 = nn.Conv2d(2,1,3,padding=1,bias=False) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out,_ = torch.max(x, dim=1, keepdim=True) + x = torch.cat([avg_out,max_out],dim=1) + x = self.conv1(x) + return self.sigmoid(x) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + x = pad_same(x,3,self.stride) + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = pad_same(out,3,1) + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(residual) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, 
planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv, **kwargs): + self.inplanes = 64 + self.deconv_with_bias = False + self.heads = heads + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=0, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + self.adaption3 = nn.Conv2d(256,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption2 = nn.Conv2d(128,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption1 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption0 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + + self.adaptionU1 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0,bias=False) + + # used for deconv layers + self.deconv_layers1 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers2 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers3 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers4 = self._make_deconv_layer(1, [256], [4],) + + # self.final_layer = [] + + for head in sorted(self.heads): + num_output = self.heads[head] + if head_conv > 0: + inchannel = 256 + fc = nn.Sequential( + nn.Conv2d(inchannel, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output, + kernel_size=1, stride=1, padding=0)) + else: + inchannel = 256 + fc = nn.Conv2d( + in_channels=inchannel, + out_channels=num_output, + kernel_size=1, + stride=1, + padding=0 + ) + self.__setattr__(head, fc) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + 
padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + elif deconv_kernel == 7: + padding = 3 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def save_map(self,x,name): + path = '/home/rujiao.lrj/pytorch2caffe/save_map.txt' + f = open(path,'a+') + f.write('----------------------------------') + f.write('%s\n'%name) + for i in range(3): + for j in range(16): + f.write('cln:%d,line:%d\n'%(i,j)) + string = '' + for k in range(16): + string = string + str(x[i][j][k]) + ' ' + f.write(string+'\n') + + def forward(self, x): + x = pad_same(x,7,2) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x0 = pad_same(x,3,2,True) + x0 = self.maxpool(x0) + x1 = self.layer1(x0) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + + x3_ = self.deconv_layers1(x4) + x3_ = self.adaption3(x3) + x3_ + + x2_ = self.deconv_layers2(x3_) + x2_ = self.adaption2(x2) + x2_ + + x1_ = self.deconv_layers3(x2_) + x1_ = self.adaption1(x1) + x1_ + + x0_ = self.deconv_layers4(x1_) + self.adaption0(x0) + x0_ = self.adaptionU1(x0_) + + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x0_) + return [ret] + + def init_weights(self, num_layers, pretrained=True): + if pretrained: + for deconv_layer in [self.deconv_layers1,self.deconv_layers2,self.deconv_layers3]: + for _, m in deconv_layer.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + for head in self.heads: + final_layer = self.__getattr__(head) + for i, m in enumerate(final_layer.modules()): + if isinstance(m, nn.Conv2d): + if m.weight.shape[0] == self.heads[head]: + if 'hm' in head: + nn.init.constant_(m.bias, -2.19) + else: + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + self.load_state_dict(pretrained_state_dict, strict=False) + else: + print('=> imagenet pretrained model dose not exist') + print('=> please download it first') + raise ValueError('imagenet pretrained model does not exist') + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net_fpn_mask(num_layers, heads, head_conv): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, 
layers, heads, head_conv=head_conv) + model.init_weights(num_layers, pretrained=True) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet_half.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet_half.py new file mode 100644 index 00000000..983cbbc1 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_mask_resnet_half.py @@ -0,0 +1,460 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=0, bias=False) + +def save_map_g(x,name): + path = '/home/rujiao.lrj/CenterNet_4point_Mask_4_rotate_offset/src/save_map.txt' + f = open(path,'a+') + f.write('---------------%s-------------------\n'%name) + shape = list(x.shape) + if shape[0]>3: + c=3 + else: + c=shape[0] + if shape[1]>16: + w=16 + else: + w=shape[1] + if shape[2]>16: + h=16 + else: + h=shape[2] + for i in range(c): + for j in range(w): + f.write('cln:%d,line:%d\n'%(i,j)) + string = '' + for k in range(h): + string = string + str(x[i][j][k]) + ' ' + f.write(string+'\n') + +def pad_same(x,ksize,stride=1,Pool=False): + shape = x.shape + n_in,w,h = shape[1],shape[2],shape[3] + if h % stride == 0: + pad_along_height = max(ksize - stride, 0) + else: + pad_along_height = max(ksize - (h % stride), 0) + if w % stride == 0: + pad_along_width = max(ksize - stride, 0) + else: + pad_along_width = max(ksize - (w % stride), 0) + pad_bottom = pad_along_height // 2 + pad_top = pad_along_height - pad_bottom + pad_right = pad_along_width // 2 + pad_left = pad_along_width - pad_right + dim = (pad_left,pad_right,pad_top,pad_bottom) + if Pool: + dim = (pad_right,pad_left,pad_bottom,pad_top) + x = F.pad(x,dim,"constant",value=0) + return x + +class ChannelAttention(nn.Module): + def __init__(self, in_planes, ratio=16): + super(ChannelAttention, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False) + self.relu1 = nn.ReLU() + self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x)))) + max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x)))) + + out = avg_out + max_out + + return self.sigmoid(out) + +class SpatialAttention(nn.Module): + def __init__(self): + 
super(SpatialAttention, self).__init__() + + self.conv1 = nn.Conv2d(2,1,3,padding=1,bias=False) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out,_ = torch.max(x, dim=1, keepdim=True) + x = torch.cat([avg_out,max_out],dim=1) + x = self.conv1(x) + return self.sigmoid(x) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + self.planes = planes + + def forward(self, x): + shape = list(x.shape) + residual = x + x = pad_same(x,3,self.stride) + + out = self.conv1(x) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.conv1') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.conv1') + ''' + out = self.bn1(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.bn1') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.bn1') + ''' + out = self.relu(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.relu') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.relu') + ''' + out = pad_same(out,3,1) + out = self.conv2(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.conv2') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.conv2') + ''' + out = self.bn2(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'layer1.0.bn2') + else: + save_map_g(out.cpu().numpy()[0],'layer1.1.bn2') + ''' + if self.downsample is not None: + residual = self.downsample(residual) + + out += residual + out = self.relu(out) + ''' + if self.planes==64: + if self.downsample is not None: + save_map_g(out.cpu().numpy()[0],'res2a.relu') + else: + save_map_g(out.cpu().numpy()[0],'res2b.relu') + ''' + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv, **kwargs): + self.inplanes = 64 + self.deconv_with_bias = False + self.heads = heads + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, 
padding=0, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 256, layers[3], stride=2) + + self.adaption3 = nn.Conv2d(256,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption2 = nn.Conv2d(128,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption1 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption0 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + + self.adaptionU1 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0,bias=False) + + # used for deconv layers + self.deconv_layers1 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers2 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers3 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers4 = self._make_deconv_layer(1, [256], [4],) + + # self.final_layer = [] + + for head in sorted(self.heads): + num_output = self.heads[head] + if head_conv > 0: + inchannel = 256 + fc = nn.Sequential( + nn.Conv2d(inchannel, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output, + kernel_size=1, stride=1, padding=0)) + else: + inchannel = 256 + fc = nn.Conv2d( + in_channels=inchannel, + out_channels=num_output, + kernel_size=1, + stride=1, + padding=0 + ) + self.__setattr__(head, fc) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + elif deconv_kernel == 7: + padding = 3 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def save_map(self,x,name): + path = '/home/rujiao.lrj/CenterNet_4point_Mask_4_rotate_offset/src/save_map.txt' + f = open(path,'a+') + 
f.write('---------------%s-------------------\n'%name) + shape = list(x.shape) + if shape[0]>3: + c=3 + else: + c=shape[0] + if shape[1]>16: + w=16 + else: + w=shape[1] + if shape[2]>16: + h=16 + else: + h=shape[2] + for i in range(c): + for j in range(w): + f.write('cln:%d,line:%d\n'%(i,j)) + string = '' + for k in range(h): + string = string + str(x[i][j][k]) + ' ' + f.write(string+'\n') + + def forward(self, x): + #self.save_map(x.cpu().numpy()[0],'input') + x = pad_same(x,7,2) + x = self.conv1(x) + #self.save_map(x.cpu().numpy()[0],'conv1') + x = self.bn1(x) + #self.save_map(x.cpu().numpy()[0],'bn1') + x = self.relu(x) + #self.save_map(x.cpu().numpy()[0],'conv1.relu') + x0 = pad_same(x,3,2,True) + x0 = self.maxpool(x0) + #self.save_map(x0.cpu().numpy()[0],'pool1') + x1 = self.layer1(x0) + #self.save_map(x1.cpu().numpy()[0],'res2b.relu') + x2 = self.layer2(x1) + #self.save_map(x2.cpu().numpy()[0],'res3b.relu') + x3 = self.layer3(x2) + #self.save_map(x3.cpu().numpy()[0],'res4b.relu') + x4 = self.layer4(x3) + #self.save_map(x4.cpu().numpy()[0],'res5b.relu') + + x3_ = self.deconv_layers1(x4) + #self.save_map(x3_.cpu().numpy()[0],'deconv_layers1.0') + x3_ = self.adaption3(x3) + x3_ + #self.save_map(x3_.cpu().numpy()[0],'res4b_') + + x2_ = self.deconv_layers2(x3_) + #self.save_map(x2_.cpu().numpy()[0],'deconv_layers2.0') + x2_ = self.adaption2(x2) + x2_ + #self.save_map(x2_.cpu().numpy()[0],'res3b_') + + x1_ = self.deconv_layers3(x2_) + #self.save_map(x1_.cpu().numpy()[0],'deconv_layers3.0') + x1_ = self.adaption1(x1) + x1_ + #self.save_map(x1_.cpu().numpy()[0],'res2b_') + + x0_ = self.deconv_layers4(x1_) + self.adaption0(x0) + x0_ = self.adaptionU1(x0_) + #self.save_map(x0_.cpu().numpy()[0],'adaptionU1') + + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x0_) + return [ret] + + #for head in self.heads: + # self.save_map(ret[head].cpu().numpy()[0],head) + + def init_weights(self, num_layers, pretrained=True): + if pretrained: + for deconv_layer in [self.deconv_layers1,self.deconv_layers2,self.deconv_layers3]: + for _, m in deconv_layer.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + for head in self.heads: + final_layer = self.__getattr__(head) + for i, m in enumerate(final_layer.modules()): + if isinstance(m, nn.Conv2d): + if m.weight.shape[0] == self.heads[head]: + if 'hm' in head: + nn.init.constant_(m.bias, -2.19) + else: + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + #self.load_state_dict(pretrained_state_dict, strict=False) + else: + print('=> imagenet pretrained model dose not exist') + print('=> please download it first') + raise ValueError('imagenet pretrained model does not exist') + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net_fpn_mask_half(num_layers, heads, head_conv): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers, pretrained=True) + return model diff --git 
a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet.py new file mode 100644 index 00000000..0abb92e9 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet.py @@ -0,0 +1,279 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import torch +import torch.nn as nn +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv, **kwargs): + self.inplanes = 64 + self.deconv_with_bias = False + self.heads = heads + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, 
stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.adaption3 = nn.Conv2d(256,256, kernel_size=3, stride=1, padding=1,bias=False) + self.adaption2 = nn.Conv2d(128,256, kernel_size=3, stride=1, padding=1,bias=False) + self.adaption1 = nn.Conv2d(64 ,256, kernel_size=3, stride=1, padding=1,bias=False) + + # used for deconv layers + self.deconv_layers1 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers2 = self._make_deconv_layer(1, [256], [4],) + self.deconv_layers3 = self._make_deconv_layer(1, [256], [4],) + + # self.final_layer = [] + + for head in sorted(self.heads): + num_output = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(256, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output, + kernel_size=1, stride=1, padding=0)) + else: + fc = nn.Conv2d( + in_channels=256, + out_channels=num_output, + kernel_size=1, + stride=1, + padding=0 + ) + self.__setattr__(head, fc) + + # self.final_layer = nn.ModuleList(self.final_layer) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x1 = self.layer1(x) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + + x3_ = self.deconv_layers1(x4) + x3_ = self.adaption3(x3) + x3_ + x2_ = self.deconv_layers2(x3_) + x2_ = self.adaption2(x2) + x2_ + x1_ = self.deconv_layers3(x2_) + x = self.adaption1(x1) + x1_ + #x = x1_ + ret = {} + for head in self.heads: + 
ret[head] = self.__getattr__(head)(x) + return [ret] + + def init_weights(self, num_layers, pretrained=True): + if pretrained: + for deconv_layer in [self.deconv_layers1,self.deconv_layers2,self.deconv_layers3]: + for _, m in deconv_layer.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + for head in self.heads: + final_layer = self.__getattr__(head) + for i, m in enumerate(final_layer.modules()): + if isinstance(m, nn.Conv2d): + if m.weight.shape[0] == self.heads[head]: + if 'hm' in head: + nn.init.constant_(m.bias, -2.19) + else: + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + self.load_state_dict(pretrained_state_dict, strict=False) + else: + print('=> imagenet pretrained model dose not exist') + print('=> please download it first') + raise ValueError('imagenet pretrained model does not exist') + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net_fpn(num_layers, heads, head_conv): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers, pretrained=True) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet_half.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet_half.py new file mode 100644 index 00000000..4b6acdad --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/fpn_resnet_half.py @@ -0,0 +1,403 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
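The FPN-style builder just defined follows the same pattern: get_pose_net_fpn assembles a torchvision-style ResNet backbone, upsamples the deepest stage back to 1/4 resolution through three deconvolution stages with lateral adaption convolutions from the earlier stages, and attaches one small convolutional head per entry in heads. A short usage sketch under assumed settings (head names, channels and input size are hypothetical; pretrained=True fetches the ImageNet resnet18 weights via model_zoo).

import torch

heads = {'hm': 2, 'wh': 8, 'reg': 2}  # hypothetical head configuration
model = get_pose_net_fpn(num_layers=18, heads=heads, head_conv=64)
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 512, 512))[0]  # dict of head outputs at 1/4 input resolution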
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=0, bias=False) + +class ChannelAttention(nn.Module): + def __init__(self, in_planes, ratio=16): + super(ChannelAttention, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False) + self.relu1 = nn.ReLU() + self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x)))) + max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x)))) + + out = avg_out + max_out + + return self.sigmoid(out) + +class SpatialAttention(nn.Module): + def __init__(self): + super(SpatialAttention, self).__init__() + + self.conv1 = nn.Conv2d(2,1,3,padding=1,bias=False) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out,_ = torch.max(x, dim=1, keepdim=True) + x = torch.cat([avg_out,max_out],dim=1) + x = self.conv1(x) + return self.sigmoid(x) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,stride=stride, padding=1) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + self.planes = planes + + def forward(self, x): + shape = list(x.shape) + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + residual = self.downsample(residual) + + out += residual + out = self.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def 
forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv, **kwargs): + self.inplanes = 64 + self.deconv_with_bias = False + self.heads = heads + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 256, layers[3], stride=2) + + self.adaption3 = nn.Conv2d(256,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption2 = nn.Conv2d(128,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption1 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + self.adaption0 = nn.Conv2d(64 ,256, kernel_size=1, stride=1, padding=0,bias=False) + + self.adaptionU1 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0,bias=False) + + # used for deconv layers + self.deconv_layers1 = self._make_deconv_layer(1, [256], [4],) #nn.Upsample(scale_factor=2, mode='bilinear',align_corners=False) + self.deconv_layers2 = self._make_deconv_layer(1, [256], [4],) #nn.Upsample(scale_factor=2, mode='bilinear',align_corners=False)# + self.deconv_layers3 = self._make_deconv_layer(1, [256], [4],) #nn.Upsample(scale_factor=2, mode='bilinear',align_corners=False)# + self.deconv_layers4 = self._make_deconv_layer(1, [256], [4],) #nn.Upsample(scale_factor=2, mode='bilinear',align_corners=False)# + + self.hm_maxpool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) + self.hm_sigmoid = nn.Sigmoid() + self.mk_maxpool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) + self.mk_sigmoid = nn.Sigmoid() + #self.hm_sig_for_pool = nn.Sigmoid() + + #self.cls = nn.Sequential(nn.Conv2d(256, 64,kernel_size=3, padding=1, bias=True), + # nn.ReLU(inplace=True), + # nn.Conv2d(64, 4, kernel_size=1, stride=1, padding=0)) + + # self.final_layer = [] + for head in sorted(self.heads): + num_output = self.heads[head] + if head_conv > 0 and (head=='reg' or head=='mk_reg'): + inchannel = 256 + fc = nn.Sequential( + nn.Conv2d(inchannel, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output, + kernel_size=1, stride=1, padding=0)) + elif head_conv > 0: + inchannel = 256 + fc = nn.Sequential( + nn.Conv2d(inchannel, head_conv, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, head_conv, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, head_conv, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, head_conv, kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, num_output,kernel_size=1, stride=1, padding=0)) + else: + inchannel = 256 + fc = nn.Conv2d( + in_channels=inchannel, + out_channels=num_output, + kernel_size=1, + stride=1, + padding=0 + ) + self.__setattr__(head, fc) + + def _make_layer(self, block, 
planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + elif deconv_kernel == 7: + padding = 3 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + layers.append( + nn.ConvTranspose2d( + in_channels=self.inplanes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias)) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def save_map(self,X,name): + x = X[0].data.cpu().numpy() + print(x.shape) + path = '/home/rujiao.lrj/CenterNet_huntie/src/%s.txt'%name + f = open(path,'w') + #f.write('---------------%s-------------------\n'%name) + f.write(name+'\n') + f.write(str(1)+' '+str(x.shape[0])+' '+str(x.shape[1])+' '+str(x.shape[2])+'\n') + shape = list(x.shape) + if shape[0]>3: + c=3 + else: + c=shape[0] + if shape[1]>16: + w=16 + else: + w=shape[1] + if shape[2]>16: + h=16 + else: + h=shape[2] + c,h,w = x.shape + for i in range(c): + for j in range(h): + #f.write('cln:%d,line:%d\n'%(i,j)) + #string = '' + for k in range(w): + #string = string + str(x[i][j][k]) + ' ' + f.write(str(x[i][j][k])+' ') + #f.write(string+'\n') + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x0 = self.maxpool(x) + x1 = self.layer1(x0) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + + x3_ = self.deconv_layers1(x4) + x3_ = self.adaption3(x3) + x3_ + + x2_ = self.deconv_layers2(x3_) + x2_ = self.adaption2(x2) + x2_ + + x1_ = self.deconv_layers3(x2_) + x1_ = self.adaption1(x1) + x1_ + + x0_ = self.deconv_layers4(x1_) + self.adaption0(x0) + x0_ = self.adaptionU1(x0_) + + ret = {} + + # leave it alone is ok + #xcls = x0_ + #for name, midlayer in self.cls._modules.items(): + # xcls = midlayer(xcls) + #ret['cls'] = xcls + + #training version + for head in self.heads: + ret[head] = self.__getattr__(head)(x0_) + + #onnx version + # hm = self.__getattr__('hm')(x0_) + # wh = self.__getattr__('wh')(x0_) + # st = self.__getattr__('st')(x0_) + # reg = self.__getattr__('reg')(x0_) + # cr = self.__getattr__('cr')(x0_) + # ax = self.__getattr__('ax')(x0_) + + + # ret['hm_sigmoid'] = self.hm_sigmoid(hm) + # ret['hm_maxpool'] = self.hm_maxpool(ret['hm_sigmoid']) + # ret['wh'] = wh + # ret['st'] = st + # ret['ax'] = ax + # 
ret['cr']= cr + # ret['reg']= reg + + + return [ret]#ret['hm'], ret['st'], ret['wh'], ret['ax'], ret['cr'], ret['reg']#[ret] + + def init_weights(self, num_layers, pretrained=True): + if pretrained: + for deconv_layer in [self.deconv_layers1,self.deconv_layers2,self.deconv_layers3]: + for _, m in deconv_layer.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + if self.deconv_with_bias: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + for head in self.heads: + final_layer = self.__getattr__(head) + for i, m in enumerate(final_layer.modules()): + if isinstance(m, nn.Conv2d): + if m.weight.shape[0] == self.heads[head]: + if 'hm' in head: + nn.init.constant_(m.bias, -2.19) + else: + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + url = model_urls['resnet{}'.format(num_layers)] + #pretrained_state_dict = model_zoo.load_url(url) + #print('=> loading pretrained model {}'.format(url)) + #self.load_state_dict(pretrained_state_dict, strict=False) + else: + print('=> imagenet pretrained model dose not exist') + print('=> please download it first') + raise ValueError('imagenet pretrained model does not exist') + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net_fpn_half(num_layers, heads, head_conv): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers, pretrained=True) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/pose_dla_dcn.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/pose_dla_dcn.py new file mode 100644 index 00000000..0f761776 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/pose_dla_dcn.py @@ -0,0 +1,559 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import math +import logging +import numpy as np +from os.path import join + +import torch +from torch import nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo +import torchvision.ops +from torch.nn.modules.utils import _pair + +# from .DCNv2.dcn_v2 import DCN +#from .dcn.modules.deform_conv import ModulatedDeformConvPack as DCN +# from mmcv.ops import ModulatedDeformConv2dPack as DCN +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + +def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'): + return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)) + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = 
nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = Bottleneck.expansion + bottle_planes = planes // expansion + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class BottleneckX(nn.Module): + expansion = 2 + cardinality = 32 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BottleneckX, self).__init__() + cardinality = BottleneckX.cardinality + # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) + # bottle_planes = dim * cardinality + bottle_planes = planes * cardinality // 32 + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, bias=False, + dilation=dilation, groups=cardinality) + self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class Root(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(Root, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, + stride=1, bias=False, padding=(kernel_size - 1) // 2) + self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(torch.cat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(nn.Module): + def __init__(self, levels, block, in_channels, out_channels, stride=1, + level_root=False, root_dim=0, root_kernel_size=1, + dilation=1, root_residual=False): + super(Tree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.tree1 = 
block(in_channels, out_channels, stride, + dilation=dilation) + self.tree2 = block(out_channels, out_channels, 1, + dilation=dilation) + else: + self.tree1 = Tree(levels - 1, block, in_channels, out_channels, + stride, root_dim=0, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + self.tree2 = Tree(levels - 1, block, out_channels, out_channels, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + if levels == 1: + self.root = Root(root_dim, out_channels, root_kernel_size, + root_residual) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2d(in_channels, out_channels, + kernel_size=1, stride=1, bias=False), + nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) + ) + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + residual = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Module): + def __init__(self, levels, channels, num_classes=1000, + block=BasicBlock, residual_root=False, linear_root=False): + super(DLA, self).__init__() + self.channels = channels + self.num_classes = num_classes + self.base_layer = nn.Sequential( + nn.Conv2d(3, channels[0], kernel_size=7, stride=1, + padding=3, bias=False), + nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM), + nn.ReLU(inplace=True)) + self.level0 = self._make_conv_level( + channels[0], channels[0], levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, + level_root=False, + root_residual=residual_root) + self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, + level_root=True, root_residual=residual_root) + self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, + level_root=True, root_residual=residual_root) + self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, + level_root=True, root_residual=residual_root) + + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + # m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + # elif isinstance(m, nn.BatchNorm2d): + # m.weight.data.fill_(1) + # m.bias.data.zero_() + + def _make_level(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + nn.MaxPool2d(stride, stride=stride), + nn.Conv2d(inplanes, planes, + kernel_size=1, stride=1, bias=False), + nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(inplanes, planes, stride, downsample=downsample)) + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, bias=False, dilation=dilation), + nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True)]) + inplanes = planes + return nn.Sequential(*modules) + + def forward(self, x): + y = [] + x = self.base_layer(x) + for i in range(6): + x = getattr(self, 'level{}'.format(i))(x) + y.append(x) + return y + + def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'): + # fc = self.fc + print('****load backbone') + if name.endswith('.pth'): + model_weights = torch.load(data + name) + else: + model_url = get_model_url(data, name, hash) + model_weights = model_zoo.load_url(model_url) + num_classes = len(model_weights[list(model_weights.keys())[-1]]) + self.fc = nn.Conv2d( + self.channels[-1], num_classes, + kernel_size=1, stride=1, padding=0, bias=True) + self.load_state_dict(model_weights) + # self.fc = fc + + +def dla34(pretrained=True, **kwargs): # DLA-34 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 128, 256, 512], + block=BasicBlock, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86') + return model + +class Identity(nn.Module): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + + +class DeformConv(nn.Module): + def __init__(self, chi, cho): + super(DeformConv, self).__init__() + self.actf = nn.Sequential( + nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True) + ) + self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1) + + def forward(self, x): + x = self.conv(x) + x = self.actf(x) + return x + + +class DCN(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + deformable_groups, + ): + + super(DCN, self).__init__() + + self.in_channels = in_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.deformable_groups = deformable_groups + + self.conv_offset_mask = nn.Conv2d(in_channels, + 3 * self.kernel_size[0] * self.kernel_size[1], + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True) + + nn.init.constant_(self.conv_offset_mask.weight, 0.) + nn.init.constant_(self.conv_offset_mask.bias, 0.) + + self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size)) + self.bias = nn.Parameter(torch.Tensor(out_channels)) + + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1.0 / math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + self.bias.data.zero_() + + def forward(self, x): + + out = self.conv_offset_mask(x) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + + x = torchvision.ops.deform_conv2d(input=x, + offset=offset, + weight=self.weight, + bias=self.bias, + padding=self.padding, + stride=self.stride, + mask=mask, + ) + return x + + +class IDAUp(nn.Module): + + def __init__(self, o, channels, up_f): + super(IDAUp, self).__init__() + for i in range(1, len(channels)): + c = channels[i] + f = int(up_f[i]) + proj = DeformConv(c, o) + node = DeformConv(o, o) + + up = nn.ConvTranspose2d(o, o, f * 2, stride=f, + padding=f // 2, output_padding=0, + groups=o, bias=False) + fill_up_weights(up) + + setattr(self, 'proj_' + str(i), proj) + setattr(self, 'up_' + str(i), up) + setattr(self, 'node_' + str(i), node) + + + def forward(self, layers, startp, endp): + for i in range(startp + 1, endp): + upsample = getattr(self, 'up_' + str(i - startp)) + project = getattr(self, 'proj_' + str(i - startp)) + layers[i] = upsample(project(layers[i])) + node = getattr(self, 'node_' + str(i - startp)) + layers[i] = node(layers[i] + layers[i - 1]) + + + +class DLAUp(nn.Module): + def __init__(self, startp, channels, scales, in_channels=None): + super(DLAUp, self).__init__() + self.startp = startp + if in_channels is None: + in_channels = channels + self.channels = channels + channels = list(channels) + scales = np.array(scales, dtype=int) + for i in range(len(channels) - 1): + j = -i - 2 + setattr(self, 'ida_{}'.format(i), + IDAUp(channels[j], in_channels[j:], + scales[j:] // scales[j])) + scales[j + 1:] = scales[j] + in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] + + def forward(self, layers): + out = [layers[-1]] # start with 32 + for i in range(len(layers) - self.startp - 1): + ida = getattr(self, 'ida_{}'.format(i)) + ida(layers, len(layers) -i - 2, len(layers)) + 
out.insert(0, layers[-1]) + return out + + +class Interpolate(nn.Module): + def __init__(self, scale, mode): + super(Interpolate, self).__init__() + self.scale = scale + self.mode = mode + + def forward(self, x): + x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) + return x + + +class DLASeg(nn.Module): + def __init__(self, base_name, heads, pretrained, down_ratio, final_kernel, + last_level, head_conv, out_channel=0): + super(DLASeg, self).__init__() + assert down_ratio in [2, 4, 8, 16] + self.first_level = int(np.log2(down_ratio)) + self.last_level = last_level + print('****identify network') + self.base = globals()[base_name](pretrained=pretrained) + channels = self.base.channels + scales = [2 ** i for i in range(len(channels[self.first_level:]))] + self.dla_up = DLAUp(self.first_level, channels[self.first_level:], scales) + + if out_channel == 0: + out_channel = channels[self.first_level] + + self.ida_up = IDAUp(out_channel, channels[self.first_level:self.last_level], + [2 ** i for i in range(self.last_level - self.first_level)]) + + self.heads = heads + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(channels[self.first_level], head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=final_kernel, stride=1, + padding=final_kernel // 2, bias=True)) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(channels[self.first_level], classes, + kernel_size=final_kernel, stride=1, + padding=final_kernel // 2, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + def forward(self, x): + x = self.base(x) + x = self.dla_up(x) + + y = [] + for i in range(self.last_level - self.first_level): + y.append(x[i].clone()) + self.ida_up(y, 0, len(y)) + + z = {} + for head in self.heads: + z[head] = self.__getattr__(head)(y[-1]) + return z['hm'], z['st'], z['wh'], z['ax'], z['cr'], z['reg']#, y[-1] + + +def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4): + model = DLASeg('dla{}'.format(num_layers), heads, + pretrained=True, + down_ratio=down_ratio, + final_kernel=1, + last_level=5, + head_conv=head_conv) + return model + diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/resnet_dcn.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/resnet_dcn.py new file mode 100644 index 00000000..805c2c3b --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/networks/resnet_dcn.py @@ -0,0 +1,290 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
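+# ResNet backbone whose upsampling path stacks modulated deformable convolution (DCN)
+# and ConvTranspose2d stages, feeding CenterNet-style output heads built from `heads`.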
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Dequan Wang and Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import math +import logging + +import torch +import torch.nn as nn +from .DCNv2.dcn_v2 import DCN +import torch.utils.model_zoo as model_zoo + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + # torch.nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + # torch.nn.init.xavier_normal_(m.weight.data) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv): + self.inplanes = 64 + self.heads = heads + self.deconv_with_bias = False + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + # used for deconv layers + self.deconv_layers = self._make_deconv_layer( + 3, + [256, 128, 64], + [4, 4, 4], + ) + + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(64, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=1, stride=1, + padding=0, bias=True)) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(64, classes, + kernel_size=1, stride=1, + padding=0, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + fc = DCN(self.inplanes, planes, + kernel_size=(3,3), stride=1, + padding=1, dilation=1, deformable_groups=1) + # fc = nn.Conv2d(self.inplanes, planes, + # kernel_size=3, stride=1, + # padding=1, dilation=1, bias=False) + # fill_fc_weights(fc) + up = nn.ConvTranspose2d( + in_channels=planes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + 
output_padding=output_padding, + bias=self.deconv_with_bias) + fill_up_weights(up) + + layers.append(fc) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + layers.append(up) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.deconv_layers(x) + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x) + return [ret] + + def init_weights(self, num_layers): + if 1: + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + self.load_state_dict(pretrained_state_dict, strict=False) + print('=> init deconv weights from normal distribution') + for name, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net(num_layers, heads, head_conv=256): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers) + return model diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/scatter_gather.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/scatter_gather.py new file mode 100644 index 00000000..9a460584 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/scatter_gather.py @@ -0,0 +1,38 @@ +import torch +from torch.autograd import Variable +from torch.nn.parallel._functions import Scatter, Gather + + +def scatter(inputs, target_gpus, dim=0, chunk_sizes=None): + r""" + Slices variables into approximately equal chunks and + distributes them across given GPUs. Duplicates + references to objects that are not variables. Does not + support Tensors. + """ + def scatter_map(obj): + if isinstance(obj, Variable): + return Scatter.apply(target_gpus, chunk_sizes, dim, obj) + assert not torch.is_tensor(obj), "Tensors not supported in scatter." 
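+        # Containers are scattered recursively: tuples/lists/dicts are taken apart,
+        # each element is scattered across the target GPUs, and the pieces are
+        # regrouped per device; any other object is duplicated (by reference) to every GPU.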
+ if isinstance(obj, tuple): + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list): + return list(map(list, zip(*map(scatter_map, obj)))) + if isinstance(obj, dict): + return list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return [obj for targets in target_gpus] + + return scatter_map(inputs) + + +def scatter_kwargs(inputs, kwargs, target_gpus, dim=0, chunk_sizes=None): + r"""Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, dim, chunk_sizes) if inputs else [] + kwargs = scatter(kwargs, target_gpus, dim, chunk_sizes) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/transformer.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/transformer.py new file mode 100644 index 00000000..92e88e7e --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/transformer.py @@ -0,0 +1,305 @@ +import torch +import torch.nn as nn +import copy +import math +from torch.autograd import Variable +import torch.nn.functional as F + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, input_size, hidden_size, N, heads, dropout): + super().__init__() + self.N = N + self.pe = PositionalEncoder(hidden_size, dropout=dropout) + self.layers = get_clones(EncoderLayer(hidden_size, heads, dropout), N) + self.norm = Norm(hidden_size) + def forward(self, x, mask = None, require_att = False): + #x = self.pe(x) + att = None + for i in range(self.N): + if mask is None: + if i == (self.N - 1): + x, att = self.layers[i](x, require_att = True) + else: + x = self.layers[i](x) + else: + x = self.layers[i](x, mask) + if require_att: + return x, att + else: + return x + +class Decoder(nn.Module): + def __init__(self, hidden_size, output_size): + super(Decoder, self).__init__() + self.linear = nn.Sequential( + nn.Linear(hidden_size, hidden_size), + nn.ReLU(inplace=True), + nn.Linear(hidden_size, output_size), + nn.ReLU(inplace=True) #newly added + ) + + def forward(self, x): + out = self.linear(x) + return out + +class Transformer(nn.Module): + def __init__(self, input_size, hidden_size, output_size, n_layers, heads, dropout): + super().__init__() + self.linear = nn.Linear(input_size, hidden_size) + self.encoder = Encoder(input_size, hidden_size, n_layers, heads, dropout) + self.decoder = Decoder(hidden_size, output_size) + def forward(self, x, mask = None, require_att = False): + x = self.linear(x) + att = None + if mask is None: + #evaluation model + if require_att: + embedding, att = self.encoder(x, require_att = True) + else: + embedding = self.encoder(x) + + output = self.decoder(embedding) + + if require_att: + return output, att + else: + return output#, att + else: + if require_att : + embedding, att = self.encoder(x, mask, require_att = True) + else: + embedding = self.encoder(x, mask) + + output = self.decoder(embedding) + return output + +def get_model(opt, src_vocab, trg_vocab): + assert opt.d_model % opt.heads == 0 + assert opt.dropout < 1 + model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads, opt.dropout) + + if opt.load_weights is not None: + 
print("loading pretrained weights...") + model.load_state_dict(torch.load(f'{opt.load_weights}/model_weights')) + else: + for p in model.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + if opt.device == 0: + model = model.cuda() + + return model + +# Sublayer +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + if len(mask.shape) == 2: + mask = mask.unsqueeze(1) + mask = mask.unsqueeze(3) + mask = mask.to(torch.float32) + mask2d = torch.matmul(mask, mask.transpose(-2, -1)).expand(scores.shape[0], scores.shape[1], scores.shape[2], scores.shape[3]) + elif len(mask.shape) == 3: + mask = mask.unsqueeze(1) + mask = mask.to(torch.float32) + mask2d = mask.expand(scores.shape[0], scores.shape[1], scores.shape[2], scores.shape[3]) + + + scores = scores.masked_fill(mask2d == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + + +def attention_score(q, k, v, d_k): + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + scores = F.softmax(scores, dim=-1) + return scores + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def attention_map(self, q, k, v, mask=None): + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + scores = attention_score(q, k, v, self.d_k) + + return scores + + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = 
self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + +#Embedding + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 900, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask = None, require_att = False): + x2 = self.norm_1(x) + xc = x2.clone() + + if mask is None: + x = x + self.dropout_1(self.attn(x2,x2,x2)) + else: + x = x + self.dropout_1(self.attn(x2,x2,x2, mask)) + + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + + if require_att : + att = self.attn.attention_map(xc, xc, xc) + return x, att + else : + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x \ No newline at end of file diff --git a/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/utils.py b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/utils.py new file mode 100644 index 00000000..df96a577 --- /dev/null +++ b/qanything_kernel/utils/loader/pdf_to_markdown/core/layout/table_rec/lib/models/utils.py @@ -0,0 +1,114 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import numpy +import torch.nn as nn + +def _sigmoid(x): + y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4) + return y + +def _h_dist_feat(output, width): + feat = (output[:,:,0] + output[:,:,1])/(2*(width+1)) + return feat + +def 
_make_pair_feat(output): + if len(output.shape) == 2: + output = output.unsqueeze(2) + + output1 = output.unsqueeze(1).expand(output.size(0), output.size(1), output.size(1), output.size(2)) + output2 = output.unsqueeze(2).expand(output.size(0), output.size(1), output.size(1), output.size(2)) + output_paired = torch.cat((output1, output2), 3) + + return output_paired + +def _v_dist_feat(output, height): + feat = (output[:,:,2] + output[:,:,3])/(2*(height + 1)) + return feat + +def _gather_feat(feat, ind, mask=None): + dim = feat.size(2) + ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) + feat = feat.gather(1, ind) + if mask is not None: + mask = mask.unsqueeze(2).expand_as(feat) + feat = feat[mask] + feat = feat.view(-1, dim) + return feat + +def _flatten_and_gather_feat(output, ind): + dim = output.size(3) + ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) + output = output.contiguous().view(output.size(0), -1, output.size(3)) + output1 = output.gather(1, ind) + + return output1 + +def _get_4ps_feat(cc_match, output): + if isinstance(output, dict): + feat = output['cr'] + else : + feat = output + device = feat.device + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.contiguous().view(feat.size(0), -1, feat.size(3)) + feat = feat.unsqueeze(3).expand(feat.size(0), feat.size(1), feat.size(2), 4) + + dim = feat.size(2) + cc_match = cc_match.unsqueeze(2).expand(cc_match.size(0), cc_match.size(1), dim, cc_match.size(2)) + if not(isinstance(output, dict)): + cc_match = torch.where(cc_match=0, cc_match, torch.zeros(cc_match.shape).to(torch.int64).to(device)) + feat = feat.gather(1, cc_match) + return feat + +def _get_wh_feat(ind, output, ttype): + + width = output['hm'].shape[2] + xs = (ind % width).unsqueeze(2).int().float() + ys = (ind // width).unsqueeze(2).int().float() + if ttype == 'gt': + wh = output['wh'] + elif ttype == 'pred': + wh = _tranpose_and_gather_feat(output['wh'], ind) + ct = torch.cat([xs, ys, xs, ys, xs, ys, xs, ys], dim=2) + bbx = ct - wh + + return bbx + +def _normalized_ps(ps, vocab_size): + device = ps.device + ps = torch.round(ps).to(torch.int64) + ps = torch.where(ps < vocab_size, ps, (vocab_size-1) * torch.ones(ps.shape).to(torch.int64).to(device)) + ps = torch.where(ps >= 0, ps, torch.zeros(ps.shape).to(torch.int64).to(device)) + return ps + +def _tranpose_and_gather_feat(feat, ind): + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = _gather_feat(feat, ind) + return feat + +def flip_tensor(x): + return torch.flip(x, [3]) + +def flip_lr(x, flip_idx): + tmp = x.detach().cpu().numpy()[..., ::-1].copy() + shape = tmp.shape + for e in flip_idx: + tmp[:, e[0], ...], tmp[:, e[1], ...] = \ + tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() + return torch.from_numpy(tmp.reshape(shape)).to(x.device) + +def flip_lr_off(x, flip_idx): + tmp = x.detach().cpu().numpy()[..., ::-1].copy() + shape = tmp.shape + tmp = tmp.reshape(tmp.shape[0], 17, 2, + tmp.shape[2], tmp.shape[3]) + tmp[:, :, 0, :, :] *= -1 + for e in flip_idx: + tmp[:, e[0], ...], tmp[:, e[1], ...] = \ + tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() + return torch.from_numpy(tmp.reshape(shape)).to(x.device) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d41e1389..fa891564 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,4 @@ faster_whisper==1.0.1 python-dotenv==1.0.1 duckduckgo-search==5.3.0b4 html2text==2024.2.26 -mistune-3.0.2 +mistune==3.0.2
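For reference, a minimal sketch (not part of the patch) of how the DLA-based network defined in pose_dla_dcn.py above is typically instantiated and queried. The import path and the head channel sizes are illustrative assumptions; the diff itself only defines the head names ('hm', 'st', 'wh', 'ax', 'cr', 'reg') and the return order of DLASeg.forward.

    import torch
    # Assumed import path; adjust to wherever lib/ sits on sys.path.
    from lib.models.networks.pose_dla_dcn import get_pose_net

    # Assumed head sizes, for illustration only; the patch does not pin these values.
    heads = {'hm': 2, 'st': 8, 'wh': 8, 'ax': 256, 'cr': 256, 'reg': 2}

    # Builds DLASeg on a dla34 backbone; pretrained=True downloads ImageNet
    # weights inside DLA.load_pretrained_model().
    model = get_pose_net(num_layers=34, heads=heads, head_conv=256, down_ratio=4).eval()

    x = torch.randn(1, 3, 512, 512)             # one RGB image
    with torch.no_grad():
        hm, st, wh, ax, cr, reg = model(x)      # six head maps at 1/4 input resolution
    print(hm.shape)                             # (1, 2, 128, 128) with the sizes assumed above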