Merge pull request #53 from kotaro-kinoshita/fix/apache-license
Add license information
kotaro-kinoshita authored Dec 5, 2024
2 parents b4e6830 + 5f5fba0 commit 3cbe59b
Showing 5 changed files with 155 additions and 36 deletions.
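
This merge adds the Apache License 2.0 notice to the five files below (all carrying a `Copyright 2023 lyuwenyu` line, i.e. code derived from RT-DETR) and re-wraps a number of long lines. One way to confirm the headers landed is to scan the first lines of each touched file; the script below is a hypothetical check (the file list is taken from this diff, everything else is illustrative):

```python
from pathlib import Path

# the five files touched by this commit
FILES = [
    "src/yomitoku/models/layers/activate.py",
    "src/yomitoku/models/layers/rtdetr_backbone.py",
    "src/yomitoku/models/layers/rtdetr_hybrid_encoder.py",
    "src/yomitoku/models/layers/rtdetrv2_decoder.py",
    "src/yomitoku/postprocessor/rtdetr_postprocessor.py",
]

for rel in FILES:
    # the added header always contains this Apache-2.0 sentence
    head = "\n".join(Path(rel).read_text(encoding="utf-8").splitlines()[:20])
    status = "ok" if "Licensed under the Apache License, Version 2.0" in head else "missing"
    print(f"{status:7s} {rel}")
```
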
13 changes: 13 additions & 0 deletions src/yomitoku/models/layers/activate.py
@@ -1,3 +1,16 @@
# Copyright(c) 2023 lyuwenyu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch.nn as nn


34 changes: 28 additions & 6 deletions src/yomitoku/models/layers/rtdetr_backbone.py
@@ -1,4 +1,16 @@
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
# Copyright 2023 lyuwenyu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict

@@ -47,7 +59,9 @@ def forward(self, x):
class BasicBlock(nn.Module):
expansion = 1

def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
def __init__(
self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
):
super().__init__()

self.shortcut = shortcut
@@ -86,7 +100,9 @@ def forward(self, x):
class BottleNeck(nn.Module):
expansion = 4

def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
def __init__(
self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
):
super().__init__()

if variant == "a":
@@ -109,13 +125,17 @@ def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
(
"conv",
ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
ConvNormLayer(
ch_in, ch_out * self.expansion, 1, 1
),
),
]
)
)
else:
self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)
self.short = ConvNormLayer(
ch_in, ch_out * self.expansion, 1, stride
)

self.act = nn.Identity() if act is None else get_activation(act)

@@ -136,7 +156,9 @@ def forward(self, x):


class Blocks(nn.Module):
def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
def __init__(
self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
):
super().__init__()

self.blocks = nn.ModuleList()
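
In the backbone hunks above, the reformatted `nn.Sequential(OrderedDict([...]))` block is the `variant == "d"` downsampling shortcut: a 2x2 average pool followed by a 1x1 projection (the ResNet-D trick), versus a strided 1x1 projection otherwise. A self-contained sketch of that shortcut, substituting plain `nn.Conv2d` + `nn.BatchNorm2d` for the repository's `ConvNormLayer` (an assumption for illustration only):

```python
import torch
import torch.nn as nn


class ShortcutD(nn.Module):
    """ResNet-D style projection shortcut: AvgPool2d, then 1x1 conv + norm.

    Stand-in for the ConvNormLayer-based block in the diff; shapes only.
    """

    def __init__(self, ch_in: int, ch_out: int, stride: int):
        super().__init__()
        layers = []
        if stride == 2:
            # matches nn.AvgPool2d(2, 2, 0, ceil_mode=True) in the hunk above
            layers.append(nn.AvgPool2d(2, 2, 0, ceil_mode=True))
        layers += [
            nn.Conv2d(ch_in, ch_out, kernel_size=1, bias=False),
            nn.BatchNorm2d(ch_out),
        ]
        self.short = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.short(x)


if __name__ == "__main__":
    x = torch.randn(1, 64, 56, 56)
    print(ShortcutD(64, 256, stride=2)(x).shape)  # torch.Size([1, 256, 28, 28])
```
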
38 changes: 31 additions & 7 deletions src/yomitoku/models/layers/rtdetr_hybrid_encoder.py
@@ -1,4 +1,16 @@
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
# Copyright 2023 lyuwenyu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
from collections import OrderedDict
@@ -240,7 +252,9 @@ def __init__(
for in_channel in in_channels:
if version == "v1":
proj = nn.Sequential(
nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
nn.Conv2d(
in_channel, hidden_dim, kernel_size=1, bias=False
),
nn.BatchNorm2d(hidden_dim),
)
elif version == "v2":
@@ -276,7 +290,9 @@ def __init__(

self.encoder = nn.ModuleList(
[
TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
TransformerEncoder(
copy.deepcopy(encoder_layer), num_encoder_layers
)
for _ in range(len(use_encoder_idx))
]
)
@@ -331,7 +347,9 @@ def _reset_parameters(self):
# self.register_buffer(f'pos_embed{idx}', pos_embed)

@staticmethod
def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
def build_2d_sincos_position_embedding(
w, h, embed_dim=256, temperature=10000.0
):
""" """
grid_w = torch.arange(int(w), dtype=torch.float32)
grid_h = torch.arange(int(h), dtype=torch.float32)
@@ -369,7 +387,9 @@ def forward(self, feats):
src_flatten.device
)

memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
memory: torch.Tensor = self.encoder[i](
src_flatten, pos_embed=pos_embed
)
proj_feats[enc_ind] = (
memory.permute(0, 2, 1)
.reshape(-1, self.hidden_dim, h, w)
@@ -381,9 +401,13 @@ def forward(self, feats):
for idx in range(len(self.in_channels) - 1, 0, -1):
feat_heigh = inner_outs[0]
feat_low = proj_feats[idx - 1]
feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
feat_heigh
)
inner_outs[0] = feat_heigh
upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
upsample_feat = F.interpolate(
feat_heigh, scale_factor=2.0, mode="nearest"
)
inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
torch.concat([upsample_feat, feat_low], dim=1)
)
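
For reference, the `build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0)` call that gets re-wrapped above builds the fixed positional encoding fed to the transformer encoder layers. The function body is not part of this hunk, so the following is an independent sketch of a standard 2D sine-cosine embedding with the same signature:

```python
import torch


def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
    """Return a [1, w*h, embed_dim] fixed sin-cos embedding over a w x h grid."""
    grid_w = torch.arange(int(w), dtype=torch.float32)
    grid_h = torch.arange(int(h), dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
    assert embed_dim % 4 == 0, "embed_dim must be divisible by 4 for 2D sin-cos"
    pos_dim = embed_dim // 4
    omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
    omega = 1.0 / (temperature**omega)
    out_w = grid_w.flatten()[..., None] @ omega[None]  # [w*h, pos_dim]
    out_h = grid_h.flatten()[..., None] @ omega[None]  # [w*h, pos_dim]
    return torch.concat(
        [out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1
    )[None, :, :]


print(build_2d_sincos_position_embedding(20, 20, 256).shape)  # torch.Size([1, 400, 256])
```
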
74 changes: 56 additions & 18 deletions src/yomitoku/models/layers/rtdetrv2_decoder.py
@@ -1,4 +1,17 @@
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
# Scene Text Recognition Model Hub
# Copyright 2023 lyuwenyu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import functools
@@ -27,7 +40,9 @@ def inverse_sigmoid(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:


class MLP(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act="relu"):
def __init__(
self, input_dim, hidden_dim, output_dim, num_layers, act="relu"
):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
@@ -178,7 +193,9 @@ def forward(
elif reference_points.shape[-1] == 4:
# reference_points [8, 480, None, 1, 4]
# sampling_offsets [8, 480, 8, 12, 2]
num_points_scale = self.num_points_scale.to(dtype=query.dtype).unsqueeze(-1)
num_points_scale = self.num_points_scale.to(
dtype=query.dtype
).unsqueeze(-1)
offset = (
sampling_offsets
* num_points_scale
@@ -313,7 +330,9 @@ def deformable_attention_core_func_v2(
_, Len_q, _, _, _ = sampling_locations.shape

split_shape = [h * w for h, w in value_spatial_shapes]
value_list = value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
value_list = (
value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
)

# sampling_offsets [8, 480, 8, 12, 2]
if method == "default":
@@ -342,7 +361,8 @@ def deformable_attention_core_func_v2(
elif method == "discrete":
# n * m, seq, n, 2
sampling_coord = (
sampling_grid_l * torch.tensor([[w, h]], device=value.device) + 0.5
sampling_grid_l * torch.tensor([[w, h]], device=value.device)
+ 0.5
).to(torch.int64)

# FIX ME? for rectangle input
@@ -369,7 +389,9 @@ def deformable_attention_core_func_v2(
attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(
bs * n_head, 1, Len_q, sum(num_points_list)
)
weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights
weighted_sample_locs = (
torch.concat(sampling_value_list, dim=-1) * attn_weights
)
output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, Len_q)

return output.permute(0, 2, 1)
@@ -584,7 +606,9 @@ def _build_input_proj_layer(self, feat_channels):
[
(
"conv",
nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False),
nn.Conv2d(
in_channels, self.hidden_dim, 1, bias=False
),
),
(
"norm",
@@ -665,9 +689,13 @@ def _generate_anchors(
torch.arange(h), torch.arange(w), indexing="ij"
)
grid_xy = torch.stack([grid_x, grid_y], dim=-1)
grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype)
grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor(
[w, h], dtype=dtype
)
wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl)
lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4)
lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(
-1, h * w, 4
)
anchors.append(lvl_anchors)

anchors = torch.concat(anchors, dim=1).to(device)
@@ -701,18 +729,22 @@ def _get_decoder_input(
)

enc_topk_bboxes_list, enc_topk_logits_list = [], []
enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = self._select_topk(
output_memory,
enc_outputs_logits,
enc_outputs_coord_unact,
self.num_queries,
enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = (
self._select_topk(
output_memory,
enc_outputs_logits,
enc_outputs_coord_unact,
self.num_queries,
)
)

# if self.num_select_queries != self.num_queries:
# raise NotImplementedError('')

if self.learn_query_content:
content = self.tgt_embed.weight.unsqueeze(0).tile([memory.shape[0], 1, 1])
content = self.tgt_embed.weight.unsqueeze(0).tile(
[memory.shape[0], 1, 1]
)
else:
content = enc_topk_memory.detach()

@@ -739,7 +771,9 @@ def _select_topk(
topk: int,
):
if self.query_select_method == "default":
_, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1)
_, topk_ind = torch.topk(
outputs_logits.max(-1).values, topk, dim=-1
)

elif self.query_select_method == "one2many":
_, topk_ind = torch.topk(outputs_logits.flatten(1), topk, dim=-1)
@@ -752,12 +786,16 @@

topk_coords = outputs_coords_unact.gather(
dim=1,
index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_coords_unact.shape[-1]),
index=topk_ind.unsqueeze(-1).repeat(
1, 1, outputs_coords_unact.shape[-1]
),
)

topk_logits = outputs_logits.gather(
dim=1,
index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1]),
index=topk_ind.unsqueeze(-1).repeat(
1, 1, outputs_logits.shape[-1]
),
)

topk_memory = memory.gather(
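
Most of the decoder hunks above only re-wrap the top-k query selection in `_select_topk`: each query is scored by its best class logit, `torch.topk` picks the indices, and `gather` pulls the matching rows from the unactivated boxes, the logits, and the encoder memory. A small standalone sketch of that pattern (tensor sizes invented for illustration):

```python
import torch

bs, num_queries, num_classes, hidden_dim, topk = 2, 300, 80, 256, 100

logits = torch.randn(bs, num_queries, num_classes)
boxes_unact = torch.randn(bs, num_queries, 4)
memory = torch.randn(bs, num_queries, hidden_dim)

# "default" selection: rank queries by their best class logit
_, topk_ind = torch.topk(logits.max(-1).values, topk, dim=-1)  # [bs, topk]

# gather the same rows from each per-query tensor
topk_boxes = boxes_unact.gather(
    dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, boxes_unact.shape[-1])
)
topk_logits = logits.gather(
    dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, logits.shape[-1])
)
topk_memory = memory.gather(
    dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, memory.shape[-1])
)
print(topk_boxes.shape, topk_logits.shape, topk_memory.shape)
# torch.Size([2, 100, 4]) torch.Size([2, 100, 80]) torch.Size([2, 100, 256])
```
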
32 changes: 27 additions & 5 deletions src/yomitoku/postprocessor/rtdetr_postprocessor.py
@@ -1,4 +1,17 @@
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
# Copyright 2023 lyuwenyu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch
import torch.nn as nn
@@ -41,12 +54,16 @@ def forward(self, outputs, orig_target_sizes: torch.Tensor, threshold):
logits, boxes = outputs["pred_logits"], outputs["pred_boxes"]
# orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)

bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
bbox_pred = torchvision.ops.box_convert(
boxes, in_fmt="cxcywh", out_fmt="xyxy"
)
bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)

if self.use_focal_loss:
scores = F.sigmoid(logits)
scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
scores, index = torch.topk(
scores.flatten(1), self.num_top_queries, dim=-1
)
# TODO for older tensorrt
# labels = index % self.num_classes
labels = mod(index, self.num_classes)
@@ -60,7 +77,9 @@ def forward(self, outputs, orig_target_sizes: torch.Tensor, threshold):
scores = F.softmax(logits)[:, :, :-1]
scores, labels = scores.max(dim=-1)
if scores.shape[1] > self.num_top_queries:
scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
scores, index = torch.topk(
scores, self.num_top_queries, dim=-1
)
labels = torch.gather(labels, dim=1, index=index)
boxes = torch.gather(
boxes,
@@ -78,7 +97,10 @@ def forward(self, outputs, orig_target_sizes: torch.Tensor, threshold):

labels = (
torch.tensor(
[mscoco_label2category[int(x.item())] for x in labels.flatten()]
[
mscoco_label2category[int(x.item())]
for x in labels.flatten()
]
)
.to(boxes.device)
.reshape(labels.shape)
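
The post-processor hunks re-wrap the box decoding path: `torchvision.ops.box_convert` turns the normalized `cxcywh` predictions into `xyxy`, and multiplying by `orig_target_sizes.repeat(1, 2).unsqueeze(1)` scales them back to pixel coordinates of each original image (sizes given as width, height). A minimal sketch with dummy tensors:

```python
import torch
import torchvision

# two images, three normalized cxcywh boxes each
boxes = torch.tensor([[[0.5, 0.5, 0.2, 0.3],
                       [0.25, 0.75, 0.1, 0.1],
                       [0.6, 0.4, 0.3, 0.2]]]).repeat(2, 1, 1)
orig_target_sizes = torch.tensor([[640.0, 480.0], [1280.0, 720.0]])  # (width, height)

bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
# scale (x1, y1, x2, y2) by (w, h, w, h) of each original image
bbox_pred = bbox_pred * orig_target_sizes.repeat(1, 2).unsqueeze(1)
print(bbox_pred[0, 0])  # tensor([256., 168., 384., 312.])
```
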
