LyrGen2 / scripts /upload_embeddings.py
James Edmunds
Checkpoint! local working probably. new embeddings, added to readme, additional scripts, updated process_lyrics and upload_embeddings and added some testscripts.
d147321
"""Upload embeddings to HuggingFace Space"""
import sys
from pathlib import Path
from huggingface_hub import HfApi
import sqlite3
# Add parent directory to path to locate 'config' module
sys.path.append(str(Path(__file__).parent.parent))
from config.settings import Settings
def verify_space_access():
    """Check that the configured HuggingFace Space is reachable with our token.

    Fetches the Space metadata and prints its runtime and SDK, then lists the
    files in the Space repo and prints how many there are.

    Returns:
        bool: True when both API calls succeed; False when either raises
        (the error is printed rather than propagated).
    """
    client = HfApi(token=Settings.HF_TOKEN)
    try:
        # Metadata lookup: fails if the Space does not exist or is not visible.
        info = client.space_info(Settings.HF_SPACE)
        print(f"Space runtime: {info.runtime}")
        print(f"Space sdk: {info.sdk}")

        # Listing the repo confirms its contents are accessible.
        # NOTE(review): both calls only read from the Hub, so success here
        # does not by itself prove write permission - confirm if needed.
        files = client.list_repo_files(
            repo_id=Settings.HF_SPACE,
            repo_type="space",
        )
        print(f"Space contents: {len(files)} files")
    except Exception as err:
        # Best-effort check: report the problem and signal failure to caller.
        print(f"Error verifying space access: {err!s}")
        return False
    return True
def _verify_local_collections(sqlite_file):
    """Print each collection in the Chroma SQLite file and check the expected one exists.

    Args:
        sqlite_file: Path to the ``chroma.sqlite3`` database.

    Raises:
        RuntimeError: If the database has no collections, or if
            ``Settings.CHROMA_COLLECTION_NAME`` is not among them.
    """
    conn = sqlite3.connect(sqlite_file)
    try:
        cursor = conn.cursor()

        cursor.execute("SELECT name, id FROM collections;")
        collections = cursor.fetchall()
        if not collections:
            raise RuntimeError("No collections found in database")

        print("\nCollections in database:")
        for name, coll_id in collections:
            # Embeddings hang off segments, which in turn belong to a collection.
            cursor.execute(
                """
                SELECT COUNT(*)
                FROM embeddings e
                JOIN segments s ON e.segment_id = s.id
                WHERE s.collection = ?
                """,
                (coll_id,),
            )
            count = cursor.fetchone()[0]
            print(f"Collection: {name}")
            print(f"ID: {coll_id}")
            print(f"Embeddings: {count}")

        collection_names = [name for name, _ in collections]
        if Settings.CHROMA_COLLECTION_NAME not in collection_names:
            raise RuntimeError(
                f"Expected collection '{Settings.CHROMA_COLLECTION_NAME}' not found. "
                f"Found: {collection_names}"
            )
    finally:
        # Always release the database handle, including on verification failure.
        conn.close()


def verify_and_upload():
    """Verify the local Chroma embeddings and upload them to HuggingFace.

    Steps:
        1. Require ``chroma.sqlite3`` under ``Settings.get_chroma_path()``.
        2. Print every collection with its embedding count and require
           ``Settings.CHROMA_COLLECTION_NAME`` to be present.
        3. Print the total size of the folder and ask for confirmation.
        4. Upload the folder to the ``Settings.HF_DATASET`` dataset repo
           under ``chroma/``.

    Returns:
        None. Returns early (after printing a message) if the user does not
        answer ``y`` at the prompt.

    Raises:
        RuntimeError: If the database file is missing, contains no
            collections, or lacks the expected collection.
        Exception: Any error raised by the upload is re-raised after
            troubleshooting hints are printed.
    """
    local_chroma = Settings.get_chroma_path()

    # Verify local files first
    print("\nVerifying local embeddings...")
    sqlite_file = local_chroma / "chroma.sqlite3"
    if not sqlite_file.exists():
        raise RuntimeError(f"SQLite database not found at {sqlite_file}")

    # The helper closes its connection before returning, so the database is
    # not held open while the folder is sized and uploaded below. SQLite may
    # keep transient side files next to the database while a connection is
    # open; closing first keeps those out of the size total and the upload.
    _verify_local_collections(sqlite_file)

    # Calculate total size
    total_size = sum(
        f.stat().st_size for f in local_chroma.glob('**/*') if f.is_file()
    )
    print(f"\nTotal size to upload: {total_size / (1024*1024):.2f} MB")

    # Confirm before upload; tolerate stray whitespace around the answer.
    response = input("\nReady to upload to HuggingFace. Continue? (y/N): ")
    if response.strip().lower() != 'y':
        print("Upload cancelled.")
        return

    print("\nUploading to HuggingFace...")
    api = HfApi(token=Settings.HF_TOKEN)
    try:
        # Upload files to the existing repo
        api.upload_folder(
            folder_path=str(local_chroma),
            repo_id=Settings.HF_DATASET,
            repo_type="dataset",
            path_in_repo="chroma",
            ignore_patterns=["*.pyc", "__pycache__/", ".DS_Store"],
        )
    except Exception as e:
        print(f"Error during upload: {e}")
        print("\nNOTE: If this was a storage limitation error:")
        print("1. Verify Git LFS is enabled")
        print("2. Check HuggingFace storage quota")
        print("3. Ensure all files are under size limits")
        raise
    else:
        print("Upload complete!")
# Script entry point: run the interactive verify-and-upload flow only when
# this file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    verify_and_upload()