File size: 3,944 Bytes
b2fcbcc
57df620
 
a0ddd95
d147321
57df620
c3e06eb
57df620
 
c3e06eb
57df620
d41478b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d147321
 
 
57df620
d147321
 
 
 
 
57df620
d147321
 
 
57df620
d41478b
d147321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0ddd95
d147321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57df620
 
d147321
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""Upload embeddings to HuggingFace Space"""
import sys
from pathlib import Path
from huggingface_hub import HfApi
import sqlite3

# Add parent directory to path to locate 'config' module
sys.path.append(str(Path(__file__).parent.parent))

from config.settings import Settings

def verify_space_access():
    """Report whether the configured Space can be queried with the stored token.

    Fetches the Space metadata, prints its runtime and SDK, then lists the
    repository files and prints how many there are.

    NOTE(review): both API calls used here are reads; a True result shows the
    Space is reachable with this token, which presumably stands in for a real
    write-permission check -- confirm before relying on it.

    Returns:
        True when both the metadata lookup and the file listing succeed,
        False when either raises (the error text is printed).
    """
    client = HfApi(token=Settings.HF_TOKEN)

    try:
        space_id = Settings.HF_SPACE

        # Metadata lookup: fails if the Space is missing or not visible.
        info = client.space_info(space_id)
        print(f"Space runtime: {info.runtime}")
        print(f"Space sdk: {info.sdk}")

        # File listing: a second, independent round trip against the repo.
        repo_files = client.list_repo_files(repo_id=space_id, repo_type="space")
        print(f"Space contents: {len(repo_files)} files")
    except Exception as err:
        print(f"Error verifying space access: {err!s}")
        return False

    return True

def _print_collection_summary(sqlite_file):
    """Print each collection's name, id and embedding count; return the names.

    The connection is opened and closed inside this helper so that no SQLite
    handle is left open once the summary has been printed.

    Args:
        sqlite_file: Path to the ``chroma.sqlite3`` database to inspect.

    Returns:
        List of collection names, in the order the database returned them.

    Raises:
        RuntimeError: If the ``collections`` table has no rows.
    """
    conn = sqlite3.connect(sqlite_file)
    try:
        cursor = conn.cursor()

        cursor.execute("SELECT name, id FROM collections;")
        collections = cursor.fetchall()

        if not collections:
            raise RuntimeError("No collections found in database")

        print("\nCollections in database:")
        for name, coll_id in collections:
            # Embeddings are linked to a collection through their segment.
            cursor.execute("""
                SELECT COUNT(*) 
                FROM embeddings e
                JOIN segments s ON e.segment_id = s.id
                WHERE s.collection = ?
            """, (coll_id,))
            count = cursor.fetchone()[0]
            print(f"Collection: {name}")
            print(f"ID: {coll_id}")
            print(f"Embeddings: {count}")

        return [name for name, _ in collections]
    finally:
        conn.close()


def verify_and_upload():
    """Verify the local Chroma embeddings, then upload them to HuggingFace.

    Steps:
      1. Check that ``chroma.sqlite3`` exists under ``Settings.get_chroma_path()``.
      2. Print every collection with its embedding count and make sure
         ``Settings.CHROMA_COLLECTION_NAME`` is among them.
      3. Print the total size of the folder and ask for confirmation.
      4. Upload the folder to the ``Settings.HF_DATASET`` dataset repo under
         the ``chroma`` path.

    The database connection is closed before the folder is measured and
    uploaded, so this process is not holding the SQLite file open while
    its bytes are being read for upload.

    Returns:
        None. Returns early, without uploading, unless the user answers "y".

    Raises:
        RuntimeError: If the database file is missing, contains no
            collections, or lacks the expected collection.
        Exception: Any error raised by the upload is re-raised after
            troubleshooting hints are printed.
    """
    local_chroma = Settings.get_chroma_path()

    # Verify local files first
    print("\nVerifying local embeddings...")
    sqlite_file = local_chroma / "chroma.sqlite3"
    if not sqlite_file.exists():
        raise RuntimeError(f"SQLite database not found at {sqlite_file}")

    collection_names = _print_collection_summary(sqlite_file)

    # Verify expected collection exists
    if Settings.CHROMA_COLLECTION_NAME not in collection_names:
        raise RuntimeError(
            f"Expected collection '{Settings.CHROMA_COLLECTION_NAME}' not found. "
            f"Found: {collection_names}"
        )

    # Calculate total size
    total_size = sum(
        f.stat().st_size for f in local_chroma.glob('**/*') if f.is_file()
    )
    print(f"\nTotal size to upload: {total_size / (1024*1024):.2f} MB")

    # Confirm before upload; surrounding whitespace must not turn a "y"
    # into a cancellation.
    response = input("\nReady to upload to HuggingFace. Continue? (y/N): ")
    if response.strip().lower() != 'y':
        print("Upload cancelled.")
        return

    print("\nUploading to HuggingFace...")
    api = HfApi(token=Settings.HF_TOKEN)

    try:
        # Upload files to the existing repo
        api.upload_folder(
            folder_path=str(local_chroma),
            repo_id=Settings.HF_DATASET,
            repo_type="dataset",
            path_in_repo="chroma",
            ignore_patterns=["*.pyc", "__pycache__/", ".DS_Store"]
        )
    except Exception as e:
        print(f"Error during upload: {e}")
        print("\nNOTE: If this was a storage limitation error:")
        print("1. Verify Git LFS is enabled")
        print("2. Check HuggingFace storage quota")
        print("3. Ensure all files are under size limits")
        raise
    else:
        print("Upload complete!")

# Run the verify-then-upload flow only when this file is executed as a
# script; importing the module triggers no upload.
if __name__ == "__main__":
    verify_and_upload()