James Edmunds committed on
Commit
b2fcbcc
·
1 Parent(s): 69c0671

refactor: simplify to use direct Space storage instead of dataset

Browse files
config/settings.py CHANGED
@@ -34,7 +34,6 @@ class Settings:
34
 
35
  # HuggingFace Settings
36
  HF_SPACE = "SongLift/LyrGen2"
37
- HF_DATASET = "SongLift/LyrGen2_DB"
38
 
39
  @classmethod
40
  def is_huggingface(cls) -> bool:
@@ -44,6 +43,5 @@ class Settings:
44
  @classmethod
45
  def get_embeddings_path(cls) -> Path:
46
  """Get appropriate embeddings path based on deployment mode"""
47
- if cls.is_huggingface():
48
- return Path('/tmp/chroma_db/chroma') # Path where HF dataset is downloaded
49
- return cls.EMBEDDINGS_DIR
 
34
 
35
  # HuggingFace Settings
36
  HF_SPACE = "SongLift/LyrGen2"
 
37
 
38
  @classmethod
39
  def is_huggingface(cls) -> bool:
 
43
  @classmethod
44
  def get_embeddings_path(cls) -> Path:
45
  """Get appropriate embeddings path based on deployment mode"""
46
+ # Use same structure in both environments
47
+ return Path("/data/processed/embeddings")
 
scripts/upload_embeddings.py CHANGED
@@ -1,4 +1,4 @@
1
- """Upload embeddings to HuggingFace"""
2
  import sys
3
  from pathlib import Path
4
  from huggingface_hub import HfApi
@@ -10,7 +10,7 @@ from config.settings import Settings
10
 
11
 
12
  def main():
13
- """Upload embeddings directory to HuggingFace dataset"""
14
  print("Starting upload process...")
15
 
16
  # Print size info
@@ -21,11 +21,12 @@ def main():
21
 
22
  api = HfApi(token=Settings.HF_TOKEN)
23
 
24
- print(f"Uploading to {Settings.HF_DATASET}...")
25
  api.upload_folder(
26
  folder_path=str(Settings.EMBEDDINGS_DIR),
27
- repo_id=Settings.HF_DATASET,
28
- repo_type="dataset"
 
29
  )
30
 
31
  print("Upload complete!")
 
1
+ """Upload embeddings to HuggingFace Space"""
2
  import sys
3
  from pathlib import Path
4
  from huggingface_hub import HfApi
 
10
 
11
 
12
  def main():
13
+ """Upload embeddings directory to HuggingFace Space"""
14
  print("Starting upload process...")
15
 
16
  # Print size info
 
21
 
22
  api = HfApi(token=Settings.HF_TOKEN)
23
 
24
+ print(f"Uploading to Space: {Settings.HF_SPACE}...")
25
  api.upload_folder(
26
  folder_path=str(Settings.EMBEDDINGS_DIR),
27
+ repo_id=Settings.HF_SPACE,
28
+ repo_type="space",
29
+ path_in_repo="data/processed/embeddings"
30
  )
31
 
32
  print("Upload complete!")
src/generator/generator.py CHANGED
@@ -29,88 +29,51 @@ class LyricGenerator:
29
 
30
  def _load_embeddings(self) -> None:
31
  """Load existing embeddings based on environment"""
32
- if Settings.is_huggingface():
33
- try:
34
- print(f"Loading embeddings from HF dataset: {Settings.HF_DATASET}")
35
- # Download dataset to Space's storage
36
- local_dir = Path(snapshot_download(
37
- repo_id=Settings.HF_DATASET,
38
- repo_type="dataset",
39
- local_dir="/tmp/chroma_db"
40
- ))
41
- print(f"Dataset downloaded to: {local_dir}")
42
-
43
- # List contents to debug
44
- print("Contents of downloaded directory:")
45
- for path in local_dir.rglob("*"):
46
- print(f" {path}")
47
-
48
- # Check if chroma directory exists
49
- chroma_dir = local_dir / "chroma"
50
- if not chroma_dir.exists():
51
- # Try looking in the root
52
- chroma_dir = local_dir
53
- if not (chroma_dir / "chroma.sqlite3").exists():
54
- raise RuntimeError(
55
- f"Chroma files not found in {local_dir}"
56
- )
57
-
58
- print(f"Loading Chroma DB from: {chroma_dir}")
59
- # Initialize vector store from the cached location
60
- self.vector_store = Chroma(
61
- persist_directory=str(chroma_dir),
62
- embedding_function=self.embeddings,
63
- collection_name="lyrics"
64
- )
65
- print("Successfully loaded vector store")
66
-
67
- # Verify collection has documents
68
- try:
69
- collection = self.vector_store._collection
70
- count = collection.count()
71
- print(f"Collection contains {count} documents")
72
-
73
- if count == 0:
74
- # Try to peek at the collection data
75
- print("Checking collection details...")
76
- peek = collection.peek()
77
- print(f"Collection peek: {peek}")
78
-
79
- raise RuntimeError(
80
- "Chroma DB is empty. Please ensure embeddings were "
81
- "properly uploaded to the dataset."
82
- )
83
- except Exception as e:
84
- print(f"Error checking collection: {str(e)}")
85
- raise RuntimeError(f"Failed to verify collection: {str(e)}")
86
-
87
- except Exception as e:
88
- print(f"Error loading HF embeddings: {str(e)}")
89
- raise RuntimeError(f"Failed to load HF embeddings: {str(e)}")
90
- else:
91
- if not self.embeddings_dir.exists():
92
  raise RuntimeError(
93
- "Embeddings not found locally. "
94
- "Please run process_lyrics.py first."
95
  )
 
 
96
 
97
- try:
98
- print(f"Loading local vector store from: {self.embeddings_dir}")
99
- # Load vector store using environment-aware settings
100
- self.vector_store = Chroma(
101
- persist_directory=str(self.embeddings_dir),
102
- embedding_function=self.embeddings,
103
- collection_name="lyrics"
104
- )
105
-
106
- # Verify collection has documents
107
- collection = self.vector_store._collection
108
- count = collection.count()
109
- print(f"Collection contains {count} documents")
110
-
111
- except Exception as e:
112
- print(f"Error loading local embeddings: {str(e)}")
113
- raise RuntimeError(f"Failed to load local embeddings: {str(e)}")
114
 
115
  # Setup QA chain
116
  self._setup_qa_chain()
 
29
 
30
  def _load_embeddings(self) -> None:
31
  """Load existing embeddings based on environment"""
32
+ try:
33
+ print(f"Loading vector store from: {self.embeddings_dir}")
34
+ # Check Chroma directory structure
35
+ chroma_dir = self.embeddings_dir / "chroma"
36
+ print(f"Checking Chroma directory: {chroma_dir}")
37
+ if not chroma_dir.exists():
38
+ raise RuntimeError(f"Chroma directory not found at {chroma_dir}")
39
+
40
+ sqlite_file = chroma_dir / "chroma.sqlite3"
41
+ print(f"Checking SQLite file: {sqlite_file}")
42
+ if not sqlite_file.exists():
43
+ raise RuntimeError(f"Chroma database not found at {sqlite_file}")
44
+ print(f"SQLite file size: {sqlite_file.stat().st_size / (1024*1024):.2f} MB")
45
+
46
+ # Load vector store using environment-aware settings
47
+ print("Initializing Chroma with settings:")
48
+ print(f" persist_directory: {str(chroma_dir)}")
49
+ print(f" collection_name: lyrics")
50
+
51
+ self.vector_store = Chroma(
52
+ persist_directory=str(chroma_dir),
53
+ embedding_function=self.embeddings,
54
+ collection_name="lyrics"
55
+ )
56
+
57
+ # Verify collection has documents
58
+ collection = self.vector_store._collection
59
+ count = collection.count()
60
+ print(f"Collection contains {count} documents")
61
+
62
+ if count == 0:
63
+ print("Collection is empty, checking details...")
64
+ # Try to peek at the collection data
65
+ peek = collection.peek()
66
+ print(f"Collection peek: {peek}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  raise RuntimeError(
68
+ "Chroma DB is empty. Please ensure embeddings "
69
+ "were properly generated and uploaded."
70
  )
71
+ else:
72
+ print("Successfully loaded embeddings")
73
 
74
+ except Exception as e:
75
+ print(f"Error loading embeddings: {str(e)}")
76
+ raise RuntimeError(f"Failed to load embeddings: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  # Setup QA chain
79
  self._setup_qa_chain()