From d5ee4b91d58faca55b4049844fec9dcccff9fd84 Mon Sep 17 00:00:00 2001
From: Francisco Pessano
Date: Sun, 13 Jul 2025 19:53:40 -0300
Subject: [PATCH] Add initial project files including .gitignore, README,
 requirements, and twitter_unifier script

---
 .gitignore         |   3 +
 README.md          |   1 +
 requirements.txt   |   1 +
 twitter_unifier.py | 153 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 158 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 requirements.txt
 create mode 100644 twitter_unifier.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..648ac40
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+prod
+twitter_dump.csv
+unified_projects.csv
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8272cce
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+python3 -m venv prod && . prod/bin/activate
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1210a2d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+pandas>=1.5.0
diff --git a/twitter_unifier.py b/twitter_unifier.py
new file mode 100644
index 0000000..48208ed
--- /dev/null
+++ b/twitter_unifier.py
@@ -0,0 +1,153 @@
+import pandas as pd
+import json
+import re
+from urllib.parse import urlparse
+
+
+def extract_github_url(text):
+    """Extract GitHub URL from text, handling t.co redirects"""
+    tco_pattern = r'https://t\.co/\w+'
+    tco_matches = re.findall(tco_pattern, text)
+    return tco_matches[0] if tco_matches else None
+
+
+def extract_media_info(media_str):
+    """Extract media information from the media field"""
+    if not media_str or media_str == '[]':
+        return None, None, None
+    try:
+        media_list = json.loads(media_str)
+        if media_list and len(media_list) > 0:
+            media_item = media_list[0]
+            return (
+                media_item.get('type'),
+                media_item.get('thumbnail'),
+                media_item.get('original')
+            )
+    except (json.JSONDecodeError, KeyError):
+        pass
+    return None, None, None
+
+
+def extract_project_url_from_card(metadata_str):
+    """Extract project URL from card metadata in reply tweets"""
+    if not metadata_str:
+        return None
+    try:
+        metadata = json.loads(metadata_str)
+        if 'card' in metadata and 'legacy' in metadata['card'] and 'binding_values' in metadata['card']['legacy']:
+            binding_values = metadata['card']['legacy']['binding_values']
+            for binding in binding_values:
+                if binding.get('key') == 'card_url':
+                    return binding.get('value', {}).get('string_value')
+                elif binding.get('key') == 'title':
+                    title = binding.get('value', {}).get('string_value', '')
+                    if 'github.com' in title.lower():
+                        github_match = re.search(r'GitHub - ([^:]+)', title)
+                        if github_match:
+                            repo_path = github_match.group(1)
+                            return f"https://github.com/{repo_path}"
+        if 'legacy' in metadata and 'entities' in metadata['legacy']:
+            entities = metadata['legacy']['entities']
+            if 'urls' in entities and entities['urls']:
+                return entities['urls'][0].get('expanded_url')
+    except (json.JSONDecodeError, KeyError):
+        pass
+    return None
+
+
+def process_twitter_dump(input_file, output_file):
+    """Process Twitter dump and create unified project CSV"""
+
+    # Read the CSV file, ensuring ID columns are treated as strings
+    # THIS IS THE KEY FIX:
+    df = pd.read_csv(input_file, dtype={'id': str, 'in_reply_to': str})
+
+    print(f"Loaded {len(df)} tweets from {input_file}")
+
+    # Sort by creation time to ensure proper ordering
+    df['created_at'] = pd.to_datetime(df['created_at'])
+    df = df.sort_values('created_at')
+
+    # Separate original tweets from replies
+    original_tweets = df[df['in_reply_to'].isnull() | (df['in_reply_to'] == 'null')].copy()
+    reply_tweets = df[df['in_reply_to'].notnull() & (df['in_reply_to'] != 'null')].copy()
+
+    print(f"Found {len(original_tweets)} original tweets and {len(reply_tweets)} reply tweets")
+
+    # Debug: Check types (should now both be 'object', which means string for pandas)
+    print(f"Original tweet ID types: {original_tweets['id'].dtype}")
+    print(f"Reply in_reply_to types: {reply_tweets['in_reply_to'].dtype}")
+
+    # Create unified projects list
+    unified_projects = []
+
+    for _, original_tweet in original_tweets.iterrows():
+        # tweet_id is already a string, no need for str() conversion
+        tweet_id = original_tweet['id']
+
+        description = re.sub(r'https://t\.co/\w+', '', original_tweet['full_text']).strip()
+        media_type, media_thumbnail, media_original = extract_media_info(original_tweet['media'])
+
+        # Look for corresponding reply tweet. Comparison is now string vs string.
+        reply_tweet = reply_tweets[reply_tweets['in_reply_to'] == tweet_id]
+
+        project_url = None
+        if not reply_tweet.empty:
+            reply_row = reply_tweet.iloc[0]
+            # print(f"Processing reply for tweet {tweet_id}")  # You can re-enable this for verbosity
+
+            project_url = extract_project_url_from_card(reply_row.get('metadata'))
+            if not project_url:
+                project_url = extract_github_url(reply_row['full_text'])
+        else:
+            # This will now only print for tweets that truly don't have a reply in the dataset
+            print(f"No reply found for tweet {tweet_id}")
+            # Optional debug to check what's going on
+            # print(f"  Match found: {tweet_id in reply_tweets['in_reply_to'].values}")
+
+        project_entry = {
+            'id': tweet_id,
+            'created_at': original_tweet['created_at'],
+            'project_description': description,
+            'project_url': project_url,
+            'media_type': media_type,
+            'media_thumbnail': media_thumbnail,
+            'media_original': media_original,
+            'author_screen_name': original_tweet['screen_name'],
+            'author_name': original_tweet['name'],
+            'favorite_count': original_tweet['favorite_count'],
+            'retweet_count': original_tweet['retweet_count'],
+            'reply_count': original_tweet['reply_count'],
+            'views_count': original_tweet['views_count']
+        }
+
+        unified_projects.append(project_entry)
+
+    output_df = pd.DataFrame(unified_projects)
+    output_df.to_csv(output_file, index=False)
+
+    print(f"Processed {len(unified_projects)} projects")
+    print(f"Output saved to {output_file}")
+
+    return output_df
+
+
+if __name__ == "__main__":
+    input_file = "twitter_dump.csv"
+    output_file = "unified_projects.csv"
+
+    result_df = process_twitter_dump(input_file, output_file)
+
+    print("\nSample output:")
+    if not result_df.empty:
+        print(result_df.head())
+        if 'project_url' in result_df.columns:
+            missing_urls = result_df[result_df['project_url'].isnull()]
+            if not missing_urls.empty:
+                print(f"\nWarning: {len(missing_urls)} projects missing URLs:")
+                print(missing_urls[['id', 'project_description']].head())
+            else:
+                print("\nAll projects have URLs!")
+    else:
+        print("No data processed!")
\ No newline at end of file
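
A minimal usage sketch, following the setup line from README.md and the filenames hard-coded in twitter_unifier.py's __main__ block. It assumes a twitter_dump.csv sits in the repository root and contains the columns the script reads (id, in_reply_to, created_at, full_text, media, metadata, screen_name, name, favorite_count, retweet_count, reply_count, views_count):

    python3 -m venv prod && . prod/bin/activate
    pip install -r requirements.txt
    python twitter_unifier.py    # reads twitter_dump.csv, writes unified_projects.csv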