Vision training playground

2025-01-16 16:19:05 -08:00 · 2025-01-16 16:19:05 -08:00 · 247477edc8
commit 247477edc8
parent 07920c88ec
7 changed files with 8047 additions and 15869 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ config.py

 # Unsloth
 _unsloth_sentencepiece_temp/
+unsloth_compiled_cache/

 # ---> Python
 # Byte-compiled / optimized / DLL files
--- a/data/booru/.gitignore
+++ b/data/booru/.gitignore
@@ -0,0 +1,3 @@
+*
+!README.md
+!.gitignore
--- a/data/booru/README.md
+++ b/data/booru/README.md
@@ -0,0 +1 @@
+Place booru images here, with filenames of the form "12345 - Tag1 Tag_2 Tag3.jpg"
--- a/data/package-lock.json
+++ b/data/package-lock.json
@@ -1,5 +1,5 @@
 {
-  "name": "discord",
+  "name": "data",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
--- a/data/proc_booru.py
+++ b/data/proc_booru.py
@@ -0,0 +1,177 @@
+"""
+proc_booru.py
+This script assumes you have a folder called 'booru/' in the current directory,
+containing a bunch of images following the Shimmie Booru naming scheme, i.e.
+'12345 - Tag1 Tag2 Tag3.jpg'.
+"""
+
+import json
+import os
+from pathlib import Path
+from typing import List, Tuple
+import re
+from unsloth import FastVisionModel
+import pandas as pd
+import numpy as np
+import PIL
+import torch
+import io
+import tqdm
+
+# real-life people who appear as tags on images
+NAMES = {"James", "Vincent", "Myles", "Sam", "Jake", "Nicolai", "David", "ren", "Nazar"}
+# tags that carry no useful information and should just be removed
+IRRELEVANT = {"_", "Myles'", "Vinny's", "Jake's", "tagme", "Nguyen"}
+
+def parse_filename(filename: str) -> Tuple[str, List[str]]:
+    """
+    Parse a filename of format '12345 - Tag1 Tag2 Tag3.jpg' into ID and tags.
+
+    Tags in IRRELEVANT are always dropped. Tags in NAMES are dropped only when
+    more than one distinct name is present, since the model won't know which
+    person is which. Every occurrence of a dropped tag is removed.
+
+    Returns tuple of (id, [tags]); raises ValueError if the name does not
+    start with digits followed by a hyphen.
+    """
+    # Remove file extension
+    stem = os.path.splitext(filename)[0]
+
+    # Split into ID and tags
+    match = re.match(r'(\d+)\s*-\s*(.*)', stem)
+    if not match:
+        raise ValueError(f"Invalid filename format: {filename}")
+
+    image_id = match.group(1)
+
+    # Filter rather than list.remove(), which only deletes the first occurrence
+    tags = [tag for tag in match.group(2).split() if tag not in IRRELEVANT]
+
+    # remove ambiguous situations with people's names, since the model won't know what they look like
+    if len(NAMES.intersection(tags)) > 1:
+        tags = [tag for tag in tags if tag not in NAMES]
+
+    return image_id, tags
+
+def create_prompt(tags: List[str]) -> List[dict]:
+    """
+    Create the chat messages asking the LLM to caption an image from its tags.
+
+    Returns a list holding one "user" message whose content is, in order: the
+    instruction text, an image placeholder, and a "Tags: a, b, c" text item.
+    """
+    tags_str = ', '.join(tags)
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "You are a helpful assistant. You must write a caption describing the following image, given a list of tags describing the image. Your response must contain absolutely nothing apart from a caption. Keep it as concise as you possibly can, at a hard maximum of two sentences. Avoid describing any small details, simply focus on the main subject of the image. Responses that are simply a repeat of the input are strictly forbidden. Your responses should be said with certainty.\n\nExample:\n```\nTags: 1991_Honda_Civic, Cisco_Parking_Lot, Grayscale, Milpitas, UnionPay\nThe image depicts a black and white photograph of a 1991 Honda Civic sedan parked in a Cisco parking lot in Milpitas, with a partial UnionPay advertisement visible.\n```\n\nExample:\n```\nTags: 2015_Honda_CB300F, Encinal_Canyon_Road, Malibu\nThe image features a 2015 Honda CB300F motorcycle parked on the side of Encinal Canyon Road in Malibu.\n```"
+                },
+                {"type": "image"},
+                {"type": "text", "text": f"Tags: {tags_str}"},
+            ]
+        }
+    ]
+
+def load_image_as_bytes(image_path: Path) -> bytes:
+    """
+    Load an image file and return it re-encoded as JPEG bytes (converted to RGB first).
+    """
+    # `import PIL` alone does not guarantee the PIL.Image submodule is loaded
+    from PIL import Image
+
+    with Image.open(image_path) as img:
+        # Modes such as RGBA or P cannot be written as JPEG, so normalise to RGB
+        rgb = img if img.mode == 'RGB' else img.convert('RGB')
+        buffer = io.BytesIO()
+        rgb.save(buffer, format='JPEG')
+        return buffer.getvalue()
+
+def main():
+    """
+    Caption every image in 'booru/' with the vision model and save the results.
+
+    Writes 'image_summaries.parquet' with one row per image that was processed
+    without error; failures are printed and skipped.
+    Raises FileNotFoundError if the 'booru' directory does not exist.
+    """
+    from PIL import Image  # `import PIL` alone may not load the Image submodule
+
+    # Check the input directory first so a missing folder fails before the model load
+    booru_dir = Path('booru')
+    if not booru_dir.exists():
+        raise FileNotFoundError("booru directory not found")
+
+    model, tokenizer = FastVisionModel.from_pretrained(
+        "unsloth/Llama-3.2-11B-Vision-Instruct",
+        load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
+        use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
+    )
+    FastVisionModel.for_inference(model)
+
+    data = []
+    image_files = [f for f in os.listdir(booru_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
+    for filename in tqdm.tqdm(image_files):
+        try:
+            filepath = booru_dir / filename
+            image_id, tags = parse_filename(filename)
+            # Build the chat prompt and tokenize it together with the image
+            prompt = create_prompt(tags)
+            input_text = tokenizer.apply_chat_template(prompt, add_generation_prompt=True)
+            # The context manager closes the image file once the inputs are built
+            with Image.open(filepath) as image:
+                inputs = tokenizer(
+                    image,
+                    input_text,
+                    add_special_tokens = False,
+                    return_tensors = "pt",
+                ).to("cuda")
+
+            # Generate summary with the loaded vision model
+            outputs = model.generate(**inputs, max_new_tokens=128,
+                   use_cache=True, temperature=1.5, min_p=0.1)
+            # Decode only the newly generated tokens and keep the first line
+            generated_text = tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
+            generated_text = generated_text.partition('\n')[0]
+
+            # Load image as bytes
+            image_bytes = load_image_as_bytes(filepath)
+            data_dict = {
+                'image_id': image_id,
+                'filename': filename,
+                'tags': tags,
+                'tags_string': ' '.join(tags),
+                'summary': generated_text,
+            }
+            # Print before attaching the image bytes to keep the log readable
+            print(data_dict)
+            data_dict['image_data'] = image_bytes
+            data.append(data_dict)
+            # Print progress
+            print(f"Processed: {filename}")
+        except ValueError as e:
+            print(f"Error processing {filename}: {e}")
+        except Exception as e:
+            print(f"Unexpected error processing {filename}: {e}")
+
+    # An empty DataFrame has no columns, so the statistics below would raise KeyError
+    if not data:
+        print("No images were processed; nothing to save.")
+        return
+
+    # Convert to DataFrame and save to Parquet
+    df = pd.DataFrame(data)
+    output_path = 'image_summaries.parquet'
+    df.to_parquet(output_path, compression='snappy')
+    print(f"\nSaved dataset to {output_path}")
+
+    # Print summary statistics
+    print(f"\nDataset Summary:")
+    print(f"Total images processed: {len(df)}")
+    print(f"Unique tags: {len(set(' '.join(df['tags_string']).split()))}")
+    print(f"Average summary length: {df['summary'].str.len().mean():.1f} characters")
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script
+    main()
--- a/mikuai-vision-training-notebook.ipynb
+++ b/mikuai-vision-training-notebook.ipynb
--- a/train_unsloth.ipynb
+++ b/train_unsloth.ipynb
				`@ -0,0 +1 @@`
				`Place booru images here, with filenames of the form "12345 - Tag1 Tag_2 Tag3.jpg"`