"""Upload embeddings to HuggingFace Space""" import sys from pathlib import Path from huggingface_hub import HfApi import sqlite3 # Add parent directory to path to locate 'config' module sys.path.append(str(Path(__file__).parent.parent)) from config.settings import Settings def verify_space_access(): """Verify access to the Space and its settings""" api = HfApi(token=Settings.HF_TOKEN) try: # Check if we have write access space_info = api.space_info(Settings.HF_SPACE) print(f"Space runtime: {space_info.runtime}") print(f"Space sdk: {space_info.sdk}") # Try to list the contents to verify write access contents = api.list_repo_files( repo_id=Settings.HF_SPACE, repo_type="space" ) print(f"Space contents: {len(contents)} files") return True except Exception as e: print(f"Error verifying space access: {str(e)}") return False def verify_and_upload(): """Verify local embeddings and upload to HuggingFace""" local_chroma = Settings.get_chroma_path() # Verify local files first print("\nVerifying local embeddings...") sqlite_file = local_chroma / "chroma.sqlite3" if not sqlite_file.exists(): raise RuntimeError(f"SQLite database not found at {sqlite_file}") # Get collection info conn = sqlite3.connect(sqlite_file) cursor = conn.cursor() try: # Verify collection cursor.execute("SELECT name, id FROM collections;") collections = cursor.fetchall() if not collections: raise RuntimeError("No collections found in database") print("\nCollections in database:") for name, coll_id in collections: # Get embeddings count cursor.execute(""" SELECT COUNT(*) FROM embeddings e JOIN segments s ON e.segment_id = s.id WHERE s.collection = ? """, (coll_id,)) count = cursor.fetchone()[0] print(f"Collection: {name}") print(f"ID: {coll_id}") print(f"Embeddings: {count}") # Verify expected collection exists collection_names = [c[0] for c in collections] if Settings.CHROMA_COLLECTION_NAME not in collection_names: raise RuntimeError( f"Expected collection '{Settings.CHROMA_COLLECTION_NAME}' not found. " f"Found: {collection_names}" ) # Calculate total size total_size = sum( f.stat().st_size for f in local_chroma.glob('**/*') if f.is_file() ) print(f"\nTotal size to upload: {total_size / (1024*1024):.2f} MB") # Confirm before upload response = input("\nReady to upload to HuggingFace. Continue? (y/N): ") if response.lower() != 'y': print("Upload cancelled.") return print("\nUploading to HuggingFace...") api = HfApi(token=Settings.HF_TOKEN) try: # Upload files to the existing repo api.upload_folder( folder_path=str(local_chroma), repo_id=Settings.HF_DATASET, repo_type="dataset", path_in_repo="chroma", ignore_patterns=["*.pyc", "__pycache__/", ".DS_Store"] ) print("Upload complete!") except Exception as e: print(f"Error during upload: {e}") print("\nNOTE: If this was a storage limitation error:") print("1. Verify Git LFS is enabled") print("2. Check HuggingFace storage quota") print("3. Ensure all files are under size limits") raise finally: conn.close() if __name__ == "__main__": verify_and_upload()