Mirror of https://github.com/FranP-code/format_twitter_projects_accounts_tweets.git
Synced 2025-10-13 00:32:19 +00:00
Add initial project files including .gitignore, README, requirements, and twitter_unifier script
.gitignore (vendored): new normal file, 3 additions
@@ -0,0 +1,3 @@
prod
twitter_dump.csv
unified_projects.csv
requirements.txt: new normal file, 1 addition
@@ -0,0 +1 @@
pandas>=1.5.0
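The pinned dependency is pandas 1.5 or newer. A quick pre-flight check that the installed version satisfies the pin (an editor's sketch, not part of the commit; the file name check_pandas.py is illustrative):

# check_pandas.py (illustrative sketch, not part of the commit)
from importlib.metadata import version

installed = version("pandas")
major, minor = map(int, installed.split(".")[:2])
assert (major, minor) >= (1, 5), f"pandas {installed} is older than the 1.5.0 pin"
print(f"pandas {installed} satisfies requirements.txt")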
twitter_unifier.py: new normal file, 153 additions
@@ -0,0 +1,153 @@
import pandas as pd
import json
import re
from urllib.parse import urlparse


def extract_github_url(text):
    """Extract GitHub URL from text, handling t.co redirects"""
    tco_pattern = r'https://t\.co/\w+'
    tco_matches = re.findall(tco_pattern, text)
    return tco_matches[0] if tco_matches else None


def extract_media_info(media_str):
    """Extract media information from the media field"""
    if not media_str or media_str == '[]':
        return None, None, None
    try:
        media_list = json.loads(media_str)
        if media_list and len(media_list) > 0:
            media_item = media_list[0]
            return (
                media_item.get('type'),
                media_item.get('thumbnail'),
                media_item.get('original')
            )
    except (json.JSONDecodeError, KeyError):
        pass
    return None, None, None


def extract_project_url_from_card(metadata_str):
    """Extract project URL from card metadata in reply tweets"""
    if not metadata_str:
        return None
    try:
        metadata = json.loads(metadata_str)
        if 'card' in metadata and 'legacy' in metadata['card'] and 'binding_values' in metadata['card']['legacy']:
            binding_values = metadata['card']['legacy']['binding_values']
            for binding in binding_values:
                if binding.get('key') == 'card_url':
                    return binding.get('value', {}).get('string_value')
                elif binding.get('key') == 'title':
                    title = binding.get('value', {}).get('string_value', '')
                    if 'github.com' in title.lower():
                        github_match = re.search(r'GitHub - ([^:]+)', title)
                        if github_match:
                            repo_path = github_match.group(1)
                            return f"https://github.com/{repo_path}"
        if 'legacy' in metadata and 'entities' in metadata['legacy']:
            entities = metadata['legacy']['entities']
            if 'urls' in entities and entities['urls']:
                return entities['urls'][0].get('expanded_url')
    except (json.JSONDecodeError, KeyError):
        pass
    return None
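
# (Editor's note, illustrative only; not part of the committed file.)
# extract_project_url_from_card expects the reply's metadata JSON to look roughly like:
#   {"card": {"legacy": {"binding_values": [
#       {"key": "card_url", "value": {"string_value": "https://github.com/user/repo"}},
#       {"key": "title", "value": {"string_value": "GitHub - user/repo: description"}}]}},
#    "legacy": {"entities": {"urls": [{"expanded_url": "https://github.com/user/repo"}]}}}
# A card_url binding is returned directly; a "GitHub - user/repo" title is rebuilt into a
# repository URL, and the first expanded_url under legacy.entities.urls is the last fallback.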


def process_twitter_dump(input_file, output_file):
    """Process Twitter dump and create unified project CSV"""

    # Read the CSV file, ensuring ID columns are treated as strings
    # THIS IS THE KEY FIX:
    df = pd.read_csv(input_file, dtype={'id': str, 'in_reply_to': str})

    print(f"Loaded {len(df)} tweets from {input_file}")

    # Sort by creation time to ensure proper ordering
    df['created_at'] = pd.to_datetime(df['created_at'])
    df = df.sort_values('created_at')

    # Separate original tweets from replies
    original_tweets = df[df['in_reply_to'].isnull() | (df['in_reply_to'] == 'null')].copy()
    reply_tweets = df[df['in_reply_to'].notnull() & (df['in_reply_to'] != 'null')].copy()

    print(f"Found {len(original_tweets)} original tweets and {len(reply_tweets)} reply tweets")

    # Debug: Check types (should now both be 'object', which means string for pandas)
    print(f"Original tweet ID types: {original_tweets['id'].dtype}")
    print(f"Reply in_reply_to types: {reply_tweets['in_reply_to'].dtype}")

    # Create unified projects list
    unified_projects = []

    for _, original_tweet in original_tweets.iterrows():
        # tweet_id is already a string, no need for str() conversion
        tweet_id = original_tweet['id']

        description = re.sub(r'https://t\.co/\w+', '', original_tweet['full_text']).strip()
        media_type, media_thumbnail, media_original = extract_media_info(original_tweet['media'])

        # Look for corresponding reply tweet. Comparison is now string vs string.
        reply_tweet = reply_tweets[reply_tweets['in_reply_to'] == tweet_id]

        project_url = None
        if not reply_tweet.empty:
            reply_row = reply_tweet.iloc[0]
            # print(f"Processing reply for tweet {tweet_id}")  # You can re-enable this for verbosity

            project_url = extract_project_url_from_card(reply_row.get('metadata'))
            if not project_url:
                project_url = extract_github_url(reply_row['full_text'])
        else:
            # This will now only print for tweets that truly don't have a reply in the dataset
            print(f"No reply found for tweet {tweet_id}")
            # Optional debug to check what's going on
            # print(f"  Match found: {tweet_id in reply_tweets['in_reply_to'].values}")

        project_entry = {
            'id': tweet_id,
            'created_at': original_tweet['created_at'],
            'project_description': description,
            'project_url': project_url,
            'media_type': media_type,
            'media_thumbnail': media_thumbnail,
            'media_original': media_original,
            'author_screen_name': original_tweet['screen_name'],
            'author_name': original_tweet['name'],
            'favorite_count': original_tweet['favorite_count'],
            'retweet_count': original_tweet['retweet_count'],
            'reply_count': original_tweet['reply_count'],
            'views_count': original_tweet['views_count']
        }

        unified_projects.append(project_entry)

    output_df = pd.DataFrame(unified_projects)
    output_df.to_csv(output_file, index=False)

    print(f"Processed {len(unified_projects)} projects")
    print(f"Output saved to {output_file}")

    return output_df


if __name__ == "__main__":
    input_file = "twitter_dump.csv"
    output_file = "unified_projects.csv"

    result_df = process_twitter_dump(input_file, output_file)

    print("\nSample output:")
    if not result_df.empty:
        print(result_df.head())
        if 'project_url' in result_df.columns:
            missing_urls = result_df[result_df['project_url'].isnull()]
            if not missing_urls.empty:
                print(f"\nWarning: {len(missing_urls)} projects missing URLs:")
                print(missing_urls[['id', 'project_description']].head())
            else:
                print("\nAll projects have URLs!")
    else:
        print("No data processed!")