Add initial project files including .gitignore, README, requirements, and twitter_unifier script

commit d5ee4b91d5
2025-07-13 19:53:40 -03:00
4 changed files with 158 additions and 0 deletions

.gitignore (new file, +3)

@@ -0,0 +1,3 @@
prod
twitter_dump.csv
unified_projects.csv

README.md (new file, +1)

@@ -0,0 +1 @@
python3 -m venv prod && . prod/bin/activate

requirements.txt (new file, +1)

@@ -0,0 +1 @@
pandas>=1.5.0

twitter_unifier.py (new file, +153)

@@ -0,0 +1,153 @@
import pandas as pd
import json
import re
from urllib.parse import urlparse


def extract_github_url(text):
    """Extract GitHub URL from text, handling t.co redirects"""
    tco_pattern = r'https://t\.co/\w+'
    tco_matches = re.findall(tco_pattern, text)
    return tco_matches[0] if tco_matches else None


def extract_media_info(media_str):
    """Extract media information from the media field"""
    # Empty media cells come through pandas as NaN (a float), so check the type too
    if not isinstance(media_str, str) or media_str == '[]':
        return None, None, None
    try:
        media_list = json.loads(media_str)
        if media_list and len(media_list) > 0:
            media_item = media_list[0]
            return (
                media_item.get('type'),
                media_item.get('thumbnail'),
                media_item.get('original')
            )
    except (json.JSONDecodeError, KeyError, TypeError):
        pass
    return None, None, None


def extract_project_url_from_card(metadata_str):
    """Extract project URL from card metadata in reply tweets"""
    if not isinstance(metadata_str, str) or not metadata_str:
        return None
    try:
        metadata = json.loads(metadata_str)
        # Prefer the card's explicit URL binding, then a GitHub repo path parsed
        # from the card title
        if 'card' in metadata and 'legacy' in metadata['card'] and 'binding_values' in metadata['card']['legacy']:
            binding_values = metadata['card']['legacy']['binding_values']
            for binding in binding_values:
                if binding.get('key') == 'card_url':
                    return binding.get('value', {}).get('string_value')
                elif binding.get('key') == 'title':
                    title = binding.get('value', {}).get('string_value', '')
                    if 'github.com' in title.lower():
                        github_match = re.search(r'GitHub - ([^:]+)', title)
                        if github_match:
                            repo_path = github_match.group(1)
                            return f"https://github.com/{repo_path}"
        # Last resort: the first expanded URL in the tweet entities
        if 'legacy' in metadata and 'entities' in metadata['legacy']:
            entities = metadata['legacy']['entities']
            if 'urls' in entities and entities['urls']:
                return entities['urls'][0].get('expanded_url')
    except (json.JSONDecodeError, KeyError, TypeError):
        pass
    return None


def process_twitter_dump(input_file, output_file):
    """Process Twitter dump and create unified project CSV"""
    # Read the CSV file, forcing the ID columns to strings so that reply
    # matching below compares string to string
    df = pd.read_csv(input_file, dtype={'id': str, 'in_reply_to': str})
    print(f"Loaded {len(df)} tweets from {input_file}")

    # Sort by creation time to ensure proper ordering
    df['created_at'] = pd.to_datetime(df['created_at'])
    df = df.sort_values('created_at')

    # Separate original tweets from replies
    original_tweets = df[df['in_reply_to'].isnull() | (df['in_reply_to'] == 'null')].copy()
    reply_tweets = df[df['in_reply_to'].notnull() & (df['in_reply_to'] != 'null')].copy()
    print(f"Found {len(original_tweets)} original tweets and {len(reply_tweets)} reply tweets")

    # Debug: both ID columns should have dtype 'object' (strings in pandas)
    print(f"Original tweet ID types: {original_tweets['id'].dtype}")
    print(f"Reply in_reply_to types: {reply_tweets['in_reply_to'].dtype}")

    # Create unified projects list
    unified_projects = []
    for _, original_tweet in original_tweets.iterrows():
        # tweet_id is already a string thanks to the dtype argument above
        tweet_id = original_tweet['id']
        description = re.sub(r'https://t\.co/\w+', '', original_tweet['full_text']).strip()
        media_type, media_thumbnail, media_original = extract_media_info(original_tweet['media'])

        # Look for the corresponding reply tweet (string vs. string comparison)
        reply_tweet = reply_tweets[reply_tweets['in_reply_to'] == tweet_id]
        project_url = None
        if not reply_tweet.empty:
            reply_row = reply_tweet.iloc[0]
            # print(f"Processing reply for tweet {tweet_id}")  # re-enable for verbose logging
            project_url = extract_project_url_from_card(reply_row.get('metadata'))
            if not project_url:
                project_url = extract_github_url(reply_row['full_text'])
        else:
            # Only reached for tweets that have no reply in the dataset
            print(f"No reply found for tweet {tweet_id}")
            # Optional debug:
            # print(f"  Match found: {tweet_id in reply_tweets['in_reply_to'].values}")

        project_entry = {
            'id': tweet_id,
            'created_at': original_tweet['created_at'],
            'project_description': description,
            'project_url': project_url,
            'media_type': media_type,
            'media_thumbnail': media_thumbnail,
            'media_original': media_original,
            'author_screen_name': original_tweet['screen_name'],
            'author_name': original_tweet['name'],
            'favorite_count': original_tweet['favorite_count'],
            'retweet_count': original_tweet['retweet_count'],
            'reply_count': original_tweet['reply_count'],
            'views_count': original_tweet['views_count']
        }
        unified_projects.append(project_entry)

    output_df = pd.DataFrame(unified_projects)
    output_df.to_csv(output_file, index=False)
    print(f"Processed {len(unified_projects)} projects")
    print(f"Output saved to {output_file}")
    return output_df


if __name__ == "__main__":
    input_file = "twitter_dump.csv"
    output_file = "unified_projects.csv"
    result_df = process_twitter_dump(input_file, output_file)

    print("\nSample output:")
    if not result_df.empty:
        print(result_df.head())
        if 'project_url' in result_df.columns:
            missing_urls = result_df[result_df['project_url'].isnull()]
            if not missing_urls.empty:
                print(f"\nWarning: {len(missing_urls)} projects missing URLs:")
                print(missing_urls[['id', 'project_description']].head())
            else:
                print("\nAll projects have URLs!")
    else:
        print("No data processed!")