Commit

wip
Ino-Ichan committed Aug 24, 2023
1 parent 8f1d9d5 commit eea2a12
Showing 39 changed files with 6,256 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
__pycache__

.ipynb_checkpoints

output/*
!output/.gitkeep

data/*
!data/.gitkeep
97 changes: 97 additions & 0 deletions README.md
@@ -0,0 +1,97 @@
# Heron - A Library for Vision/Video and Language models

<a href='https://huggingface.co/Inoichan/GIT-Llama-2-7B'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>

<img src="./images/heron_image.png">

Welcome to the "heron" repository. Heron is a library that seamlessly integrates multiple Vision and Language models, as well as Video and Language models. One of its standout features is its support for Japanese V&L models. Additionally, we provide pretrained weights trained on various datasets.


# Installation
1. Clone this repository
```bash
git clone https://github.com/turingmotors/heron
cd heron
```

2. Install Packages
```bash
conda create -n git_llm python=3.10 -y
conda activate git_llm
pip install --upgrade pip # enable PEP 660 support

pip install -r requirements.txt
pip install -e .
```

## For Llama 2
First, request access to the Llama 2 models on the [Hugging Face page](https://huggingface.co/meta-llama/Llama-2-7b) and the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/).

Then sign in to your Hugging Face account:
```bash
huggingface-cli login
```
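
If you prefer to authenticate from Python (for example inside a notebook) rather than the CLI, the `huggingface_hub` package that ships with `transformers` provides an equivalent login helper; this is just an optional alternative to the command above:

```python
# Optional alternative to `huggingface-cli login`: authenticate from Python.
# You will be prompted for an access token from https://huggingface.co/settings/tokens
from huggingface_hub import login

login()  # or login(token="hf_...") to pass the token non-interactively
```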

# Training

We currently support LLaMA, MPT, and OPT as LLM modules. To start training, run:

```bash
./scripts/run.sh
```

# Evaluation

You can download the pretrained weights from the Hugging Face Hub: [Inoichan/GIT-Llama-2-7B](https://huggingface.co/Inoichan/GIT-Llama-2-7B)<br>
See also the [notebooks](./notebooks).

```python
import requests
import torch
from PIL import Image
from transformers import AutoProcessor

from git_llm.git_llama import GitLlamaForCausalLM

device_id = 0

# prepare a pretrained model
model = GitLlamaForCausalLM.from_pretrained('Inoichan/GIT-Llama-2-7B')
model.eval()
model.to(f"cuda:{device_id}")

# prepare a processor
processor = AutoProcessor.from_pretrained('Inoichan/GIT-Llama-2-7B')

# prepare inputs
url = "https://www.barnorama.com/wp-content/uploads/2016/12/03-Confusing-Pictures.jpg"
image = Image.open(requests.get(url, stream=True).raw)

text = f"##Instruction: Please answer the following question concretely. ##Question: What is unusual about this image? Explain precisely and concretely what he is doing. ##Answer: "

# do preprocessing
inputs = processor(
    text,
    image,
    return_tensors="pt",
    truncation=True,
)
inputs = {k: v.to(f"cuda:{device_id}") for k, v in inputs.items()}

# set eos token
eos_token_id_list = [
    processor.tokenizer.pad_token_id,
    processor.tokenizer.eos_token_id,
]

# do inference
with torch.no_grad():
    out = model.generate(**inputs, max_length=256, do_sample=False, temperature=0., eos_token_id=eos_token_id_list)

# print result
print(processor.tokenizer.batch_decode(out))
```
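
The decoded output echoes the whole `##Instruction / ##Question / ##Answer` prompt. If you only want the generated answer, a small post-processing step along these lines works (this helper is not part of the library, just a sketch based on the prompt template above):

```python
# Strip special tokens and keep only the text after the "##Answer:" marker.
decoded = processor.tokenizer.batch_decode(out, skip_special_tokens=True)[0]
answer = decoded.split("##Answer:")[-1].strip()
print(answer)
```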

# Acknowledgements

- [GenerativeImage2Text](https://github.com/microsoft/GenerativeImage2Text): The main idea of the model is based on the original GIT.
- [LLaVA](https://github.com/haotian-liu/LLaVA): This project learned a lot from the great LLaVA project.
- [GIT-LLM](https://github.com/Ino-Ichan/GIT-LLM)
- [video_blip](https://github.com/kotarotanahashi/video_blip)
3 changes: 3 additions & 0 deletions configs/datasets/m3it.yaml
@@ -0,0 +1,3 @@
dataset_type: coco

train_val: true
48 changes: 48 additions & 0 deletions configs/deepspeed/ds_config_zero1.json
@@ -0,0 +1,48 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 1,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": 1e7,
        "stage3_prefetch_bucket_size": 1e7,
        "stage3_param_persistence_threshold": 10240,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
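
For context, the `"auto"` values in this file are placeholders that the Hugging Face `Trainer` resolves from its own `TrainingArguments` at launch time; the file is not meant to be fed to DeepSpeed directly. A minimal sketch of that wiring (model and dataset setup omitted, and the argument values below are only illustrative):

```python
# Sketch: hand the ZeRO-1 config to the Hugging Face Trainer, which fills
# in every "auto" field (batch size, lr, scheduler, fp16, ...) from these args.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_steps=100,          # illustrative; becomes warmup_num_steps
    fp16=True,                 # becomes fp16.enabled
    deepspeed="configs/deepspeed/ds_config_zero1.json",
)

# trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
# trainer.train()
```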
48 changes: 48 additions & 0 deletions configs/deepspeed/ds_config_zero2.json
@@ -0,0 +1,48 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": 1e7,
        "stage3_prefetch_bucket_size": 1e7,
        "stage3_param_persistence_threshold": 10240,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
56 changes: 56 additions & 0 deletions configs/deepspeed/ds_config_zero3.json
@@ -0,0 +1,56 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": 1e7,
        "stage3_prefetch_bucket_size": 1e7,
        "stage3_param_persistence_threshold": 10240,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
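
The ZeRO-3 variant additionally offloads optimizer states and parameters to CPU, trading host RAM and transfer bandwidth for GPU memory. To get a rough feel for whether a model fits under ZeRO-3 before launching, DeepSpeed provides a memory estimator; the snippet below is a sketch based on the DeepSpeed/Transformers documentation, and the helper's import path may differ across DeepSpeed versions:

```python
# Rough ZeRO-3 memory estimate for the 7B model used in this repository.
# Note: this loads the full model on CPU just to count parameters.
from transformers import AutoModelForCausalLM
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)
```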
64 changes: 64 additions & 0 deletions configs/llama/training_config_exp050_llama.yml
@@ -0,0 +1,64 @@
training:
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 4
  num_train_epochs: 1
  dataloader_num_workers: 16
  fp16: true
  optim: "adamw_torch"
  learning_rate: 5.0e-5
  logging_steps: 100
  evaluation_strategy: "steps"
  save_strategy: "steps"
  eval_steps: 4000
  save_steps: 4000
  save_total_limit: 1
  deepspeed: configs/ds_config_zero1.json
  output_dir: ./output/
  report_to: "wandb"

settings:
  model_name: meta-llama/Llama-2-7b-chat-hf
  vision_model_name: openai/clip-vit-base-patch16
  num_image_with_embedding: # None or video sequence num
  max_length: 512
  keys_finetune:
    - visual_projection
    - num_image_with_embedding

  use_lora: true
  lora:
    r: 8
    lora_alpha: 32
    target_modules:
      - q_proj
      - v_proj
    lora_dropout: 0.01
    bias: none
    task_type: CAUSAL_LM

dataset_type: # path/to/config
  - coco
  - textcap
  - image-paragraph-captioning
  - coco-goi
  - coco-text
  - imagenet
  - coco-itm
  - snli-ve
  - mocheg
  - iqa
  - vqa-v2
  - shapes
  - docvqa
  - ocr-vqa
  - st-vqa
  - text-vqa
  - gqa
  - okvqa
  - a-okvqa
  - viquae
  - clevr
  - vcr
  - visual-mrc
  - visual-dialog
  - multi30k
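
The `lora` block above maps directly onto a PEFT `LoraConfig`. The snippet below is a minimal sketch of how such a config could be consumed, assuming the nesting shown above and using a plain language model for illustration (the repository itself wraps the LLM in a GIT-style vision-and-language model, and its actual loading code may differ):

```python
# Sketch: build a PEFT LoRA adapter from the `settings.lora` section above.
import yaml
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

with open("configs/llama/training_config_exp050_llama.yml") as f:
    config = yaml.safe_load(f)

settings = config["settings"]
model = AutoModelForCausalLM.from_pretrained(settings["model_name"])

if settings.get("use_lora"):
    lora_config = LoraConfig(
        r=settings["lora"]["r"],
        lora_alpha=settings["lora"]["lora_alpha"],
        target_modules=settings["lora"]["target_modules"],
        lora_dropout=settings["lora"]["lora_dropout"],
        bias=settings["lora"]["bias"],
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
```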
Empty file added heron/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions heron/datasets/README.md
@@ -0,0 +1,17 @@
# Dataset Descriptions

# Supported Datasets

## English
- [M3IT](https://huggingface.co/datasets/MMInstruction/M3IT)

## Japanese
- [STAIR](http://captions.stair.center/)
- [Japanese Visual Genome VQA dataset](https://github.com/yahoojapan/ja-vg-vqa)

### Preparing CSV files for Japanese STAIR/Visual Genome

Download the datasets into the [data](../../data/) directory.<br>
To use the Japanese datasets, please generate preprocessed CSV files. See the notebooks in [preprocess](./preprocess/).


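As a rough illustration of that preprocessing (the notebooks in [preprocess](./preprocess/) define the actual column layout; the file name and the `img_path`/`caption` columns below are only placeholders, assuming STAIR Captions' MS-COCO-style JSON):

```python
# Hypothetical sketch: flatten STAIR Captions (MS-COCO-format JSON) into a CSV.
import json

import pandas as pd

with open("data/stair_captions_v1.2_train.json") as f:
    stair = json.load(f)

id_to_file = {img["id"]: img["file_name"] for img in stair["images"]}
rows = [
    {"img_path": id_to_file[ann["image_id"]], "caption": ann["caption"]}
    for ann in stair["annotations"]
]
pd.DataFrame(rows).to_csv("data/stair_captions_train.csv", index=False)
```
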
Empty file added heron/datasets/__init__.py
Empty file.
30 changes: 30 additions & 0 deletions heron/datasets/base_datasets.py
@@ -0,0 +1,30 @@
import abc

from torch.utils.data import Dataset


class BaseDataset(Dataset):

    def __init__(self, is_inference: bool = False):
        super(BaseDataset, self).__init__()
        self.is_inference = is_inference

    @classmethod
    @abc.abstractmethod
    def create(cls, *args, **kwargs):
        # Factory hook: concrete datasets construct themselves here (e.g. from a config).
        raise NotImplementedError

    def __getitem__(self, index):
        # Dispatch to the training- or inference-specific getter.
        if self.is_inference:
            return self._get_item_inference(index)
        else:
            return self._get_item_train(index)

    @abc.abstractmethod
    def _get_item_train(self, index):
        raise NotImplementedError

    @abc.abstractmethod
    def _get_item_inference(self, index):
        raise NotImplementedError
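
To make the intended contract concrete, a hypothetical subclass (not part of this commit) might look roughly like this: `create` builds the dataset, and the two `_get_item_*` hooks return training and inference samples respectively.

```python
# Hypothetical example subclass illustrating the BaseDataset contract.
import pandas as pd


class CsvCaptionDataset(BaseDataset):
    def __init__(self, df: pd.DataFrame, is_inference: bool = False):
        super().__init__(is_inference=is_inference)
        self.df = df

    @classmethod
    def create(cls, dataset_config: dict, is_inference: bool = False):
        # Build the dataset from a config dict, e.g. {"csv_path": "data/stair_captions_train.csv"}
        return cls(pd.read_csv(dataset_config["csv_path"]), is_inference=is_inference)

    def __len__(self):
        return len(self.df)

    def _get_item_train(self, index):
        row = self.df.iloc[index]
        return {"image_path": row["img_path"], "text": row["caption"]}

    def _get_item_inference(self, index):
        row = self.df.iloc[index]
        # At inference time the caption is kept as the reference target only.
        return {"image_path": row["img_path"], "target": row["caption"]}
```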
