Used Mistral Large to rewrite the summarize process.

2025-09-01 10:09:56 +00:00 · 2024-08-19 07:41:32 -04:00 · 2024-08-19 07:41:32 -04:00 · 3c05cd78dd
commit 3c05cd78dd
parent 81e4e165bc
1 changed file with 38 additions and 72 deletions
--- a/summarize.py
+++ b/summarize.py
@@ -21,23 +21,18 @@ CHUNK_SIZE = int(os.getenv("CHUNK_SIZE"))
 TEMPERATURE = float(os.getenv("TEMPERATURE"))

 def whisper_api(file):
-    # Whisper supports multiple files, but we're sending one
+    """Transcribe audio file using Whisper API."""
    files = {"file": file}
-    
-    # Required API call data
    api_data = {
        "temperature": "0.0",
        "response_format": "json"
    }
-
-    # Call API and return text
    response = requests.post(WHISPERCPP_URL, data=api_data, files=files)
    return response.json()["text"]

 def llama_api(prompt):
-    # Format prompt before sending
+    """Generate response using llama.cpp server API."""
    formatted_prompt = PROMPT_FORMAT.format(system=SYSTEM_MESSAGE, prompt=prompt)
-
    api_data = {
        "prompt": formatted_prompt,
        "n_predict": -1,
@@ -45,19 +40,15 @@ def llama_api(prompt):
        "stop": [STOP_TOKEN],
        "tokens_cached": 0
    }
-
    response = requests.post(LLAMACPP_URL, headers={"Content-Type": "application/json"}, json=api_data)
    json_output = response.json()
    return json_output['content']

-# Use ffmpeg to trim silence in wav files, to prevent issues with 
-# whisper.cpp stopping the transcode if it detects a large amount of silence
 def trim_silence(filename):
-    # Create a temporary file for the output
+    """Trim silence from audio file using FFmpeg."""
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_filename = temp_file.name

-    # Construct the FFmpeg command
    ffmpeg_command = [
        "ffmpeg",
        "-i", filename,
@@ -66,84 +57,59 @@ def trim_silence(filename):
        temp_filename
    ]

-    # Run the FFmpeg command
    result = subprocess.run(ffmpeg_command, capture_output=True, text=True, check=True)
-
-    # If FFmpeg command was successful, replace the original file
    os.replace(temp_filename, filename)

-# Iterate over each WAV file and transcode with whisper API
-wav_files = [f for f in os.listdir(".") if f.endswith(".wav")]
-for wav_file in wav_files:
+def process_wav_files():
+    """Process WAV files: trim silence and transcribe."""
+    wav_files = [f for f in os.listdir(".") if f.endswith(".wav")]
+    for wav_file in wav_files:
+        print("Trimming silence: " + wav_file)
+        trim_silence(wav_file)

-    # Trim silence on the wav file first
-    print("Trimming silence: " + wav_file)
-    trim_silence(wav_file)
-    
-    # Open the WAV file for sending to whisper REST API
-    with open(wav_file, "rb") as file:
-        print("Transcribing: " + wav_file)
-        # Call whisper API to transcode file
-        output_text = whisper_api(file)
+        with open(wav_file, "rb") as file:
+            print("Transcribing: " + wav_file)
+            output_text = whisper_api(file)
+            output_file = os.path.splitext(wav_file)[0] + ".tns"
+            with open(output_file, "w") as output:
+                output.write(output_text)

-        # Generate the output file name by replacing the extension with .tns
-        output_file = os.path.splitext(wav_file)[0] + ".tns"
-
-        # Write the output text to the file
-        with open(output_file, "w") as output:
-            output.write(output_text)
-        
-# Chunk the full transcript into multiple parts to fit in the context window
-# and allow for better reasoning capability
 def chunk_transcript(string, chunk_size):
+    """Chunk the transcript to fit in the context window."""
    chunks = []
-    lines = string.split("\n")  # Split the string on newline characters
+    lines = string.split("\n")
    current_chunk = ""
    for line in lines:
-        current_chunk += line  # Build up the string until the chunk size is reached
+        current_chunk += line
        if len(current_chunk) >= chunk_size:
            chunks.append(current_chunk)
            current_chunk = ""
-    if current_chunk:  # Add the last chunk if it's not empty
+    if current_chunk:
        chunks.append(current_chunk)
    return chunks

-# Get the current date in yyyymmdd format
-today = datetime.datetime.now().strftime('%Y%m%d')
+def summarize_transcripts():
+    """Summarize transcript files."""
+    today = datetime.datetime.now().strftime('%Y%m%d')
+    summary_filename = "summary-" + today + ".md"
+    transcript_files = [f for f in os.listdir(".") if f.endswith(".tns")]

-# Modify the filename by appending the current date
-summary_filename = "summary-" + today + ".md"
+    for transcript in transcript_files:
+        print("Summarizing: " + transcript)
+        with open(transcript, "r") as file:
+            transcript_data = file.read()
+            chunked_data = chunk_transcript(transcript_data, CHUNK_SIZE)

-# Get the list of transcript files in the current directory
-transcript_files = [f for f in os.listdir(".") if f.endswith(".tns")]
-
-# Iterate over each WAV file
-for transcript in transcript_files: 
-    print("Summarizing: " + transcript)
-
-    # Open the WAV file
-    with open(transcript, "r") as file:
-        transcript_data = file.read()
-
-        # chunk the transcript so we don't blow out the context window
-        chunked_data = chunk_transcript(transcript_data, CHUNK_SIZE)
-
-        # Iterate through the chunks, and summarize them
-        for i, chunk in enumerate(chunked_data):
            with open(summary_filename, "a") as md_file:
-                # Generate call summary
-                summary_prompt = SUMMARY_PROMPT.format(chunk=chunk)
-                summary = llama_api(summary_prompt)
+                for i, chunk in enumerate(chunked_data):
+                    summary = llama_api(SUMMARY_PROMPT.format(chunk=chunk))
+                    facts = llama_api(FACT_PROMPT.format(chunk=chunk))
+                    sentiment = llama_api(SENTIMENT_PROMPT.format(chunk=chunk))

-                # Generate fact summary
-                fact_prompt = FACT_PROMPT.format(chunk=chunk)
-                facts = llama_api(fact_prompt)
+                    md_file.write(f"# Call Transcript - {transcript} - Part {i + 1}\n\nSummary: {summary}\n\nFacts:\n{facts}\n\nSentiment: {sentiment}\n\n---\n")

-                # Generate call sentiment
-                sentiment_prompt = SENTIMENT_PROMPT.format(chunk=chunk)
-                sentiment = llama_api(sentiment_prompt)
+    print("Summarizing complete")

-                # Write the notes
-                md_file.write(f"# Call Transcript - {transcript} - Part {i + 1}\n\nSummary: {summary}\n\nFacts:\n{facts}\n\nSentiment: {sentiment}\n\n---\n")
-
-print("Summarizing complete")
+if __name__ == "__main__":
+    process_wav_files()
+    summarize_transcripts()