James Edmunds
Checkpoint! Local version is probably working. New embeddings, additions to the README, additional scripts, updates to process_lyrics and upload_embeddings, and some added test scripts.
d147321 | """Upload embeddings to HuggingFace Space""" | |
| import sys | |
| from pathlib import Path | |
| from huggingface_hub import HfApi | |
| import sqlite3 | |
| # Add parent directory to path to locate 'config' module | |
| sys.path.append(str(Path(__file__).parent.parent)) | |
| from config.settings import Settings | |
def verify_space_access():
    """Report whether the configured Space can be queried with the configured token.

    Fetches the info for ``Settings.HF_SPACE`` (printing its runtime and SDK)
    and lists the Space's repository files (printing how many there are).

    Returns:
        bool: True when both Hub calls succeed; False when either one raises,
        after the error has been printed.
    """
    hub = HfApi(token=Settings.HF_TOKEN)
    try:
        # NOTE(review): both calls below only read from the Hub, so success
        # shows the Space is reachable with this token; it does not by itself
        # prove write permission.
        info = hub.space_info(Settings.HF_SPACE)
        print(f"Space runtime: {info.runtime}")
        print(f"Space sdk: {info.sdk}")

        file_names = hub.list_repo_files(
            repo_id=Settings.HF_SPACE,
            repo_type="space",
        )
        print(f"Space contents: {len(file_names)} files")
    except Exception as err:
        # Best-effort probe: report the failure instead of propagating it.
        print(f"Error verifying space access: {err!s}")
        return False
    return True
def verify_and_upload():
    """Verify the local Chroma embeddings, then upload them to HuggingFace.

    Steps:
      1. Check that ``chroma.sqlite3`` exists under
         ``Settings.get_chroma_path()``.
      2. Print every collection in the database with its id and embedding
         count, and require ``Settings.CHROMA_COLLECTION_NAME`` to be present.
      3. Print the total on-disk size of the folder and ask for confirmation
         on stdin.
      4. Upload the folder to the ``Settings.HF_DATASET`` dataset repo under
         ``chroma/``.

    Returns:
        None. Returns early, without uploading, unless the user answers "y".

    Raises:
        RuntimeError: If the SQLite file is missing, the database contains no
            collections, or the expected collection is absent.
        Exception: Any error raised by the upload is re-raised after
            troubleshooting hints have been printed.
    """
    local_chroma = Settings.get_chroma_path()

    print("\nVerifying local embeddings...")
    sqlite_file = local_chroma / "chroma.sqlite3"
    # sqlite3.connect() would create an empty database at a missing path,
    # so the existence check has to come first.
    if not sqlite_file.exists():
        raise RuntimeError(f"SQLite database not found at {sqlite_file}")

    # The helper closes its connection before returning, so the database is
    # not held open while waiting for the user or while the folder (which
    # contains this very file) is being uploaded.
    collection_stats = _read_collection_stats(sqlite_file)
    if not collection_stats:
        raise RuntimeError("No collections found in database")

    print("\nCollections in database:")
    for name, coll_id, count in collection_stats:
        print(f"Collection: {name}")
        print(f"ID: {coll_id}")
        print(f"Embeddings: {count}")

    # Verify the collection the application expects is actually there.
    collection_names = [name for name, _, _ in collection_stats]
    if Settings.CHROMA_COLLECTION_NAME not in collection_names:
        raise RuntimeError(
            f"Expected collection '{Settings.CHROMA_COLLECTION_NAME}' not found. "
            f"Found: {collection_names}"
        )

    total_size = sum(
        f.stat().st_size for f in local_chroma.glob('**/*') if f.is_file()
    )
    print(f"\nTotal size to upload: {total_size / (1024*1024):.2f} MB")

    # Anything other than "y" (ignoring case and surrounding whitespace)
    # cancels, matching the "(y/N)" default shown in the prompt.
    response = input("\nReady to upload to HuggingFace. Continue? (y/N): ")
    if response.strip().lower() != 'y':
        print("Upload cancelled.")
        return

    print("\nUploading to HuggingFace...")
    api = HfApi(token=Settings.HF_TOKEN)
    try:
        # Upload files to the existing repo.
        api.upload_folder(
            folder_path=str(local_chroma),
            repo_id=Settings.HF_DATASET,
            repo_type="dataset",
            path_in_repo="chroma",
            ignore_patterns=["*.pyc", "__pycache__/", ".DS_Store"],
        )
        print("Upload complete!")
    except Exception as e:
        # Print hints for the most likely failure, then let the error surface.
        print(f"Error during upload: {e}")
        print("\nNOTE: If this was a storage limitation error:")
        print("1. Verify Git LFS is enabled")
        print("2. Check HuggingFace storage quota")
        print("3. Ensure all files are under size limits")
        raise


def _read_collection_stats(sqlite_file):
    """Return ``(name, id, embedding_count)`` for each collection in the database.

    Opens *sqlite_file*, reads every row of ``collections`` and counts the
    ``embeddings`` rows joined through ``segments`` for each one. The
    connection is always closed before returning or raising.
    """
    conn = sqlite3.connect(sqlite_file)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT name, id FROM collections;")
        collections = cursor.fetchall()

        stats = []
        for name, coll_id in collections:
            cursor.execute(
                """
                SELECT COUNT(*)
                FROM embeddings e
                JOIN segments s ON e.segment_id = s.id
                WHERE s.collection = ?
                """,
                (coll_id,),
            )
            stats.append((name, coll_id, cursor.fetchone()[0]))
        return stats
    finally:
        conn.close()
# Script entry point: run the verify-then-upload flow only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    verify_and_upload()