From Wikipedia, I collected data about the top 10 Bollywood hits of each year from 1947 to 2022. I used OpenAI's models to classify each movie as 'female-led' or not, and to assign genders to the cast members and the movie director. Together, these help paint a picture of female representation over the years, specifically among Bollywood hits. Read more about this data and how it was collected on the Kaggle dataset page that I created.
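I am using three measures, all imperfect, to understand the representation of women in Bollywood hits: (1) the share of hits that are female-led, (2) the "female cast importance", i.e. the position-weighted share of women in the cast list, and (3) the share of hits directed by women.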
While each of these measures gives an incomplete picture, the combination of all three is more reliable and tells an interesting story.
import html
import json
import math

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML
# Load the dataset. The parsing below requires every non-blank line of the
# file to be one complete JSON value (one movie record per line).
with open('hindi_hits_from_1947_to_2022_ai_enhanced.json', 'r', encoding='utf-8') as f:
    # Drop duplicate records while keeping the file's order. A plain set()
    # iterates strings in a hash-dependent order that changes between runs,
    # which would make the DataFrame index labels shown in the tables
    # non-reproducible. Stripping first means blank lines are skipped and a
    # duplicate on a final line without a trailing newline is still caught.
    lines = list(dict.fromkeys(line.strip() for line in f if line.strip()))
movies = json.loads("[" + ",".join(lines) + "]")
print("Loaded {} movies".format(len(movies)))
Loaded 740 movies
# A given gender's score is proportional to two factors:
# 1) how many actors of that gender there are
# 2) how high up in the cast list they are
def female_cast_importance(cast):
    """Return the female share of the position-weighted cast score.

    Each cast member at zero-based position ``i`` contributes
    ``1 / log(i + 2)`` to the score of their gender, so earlier positions
    in the list weigh more. Entries whose ``'gender'`` is missing, ``None``
    or anything other than male/female (case-insensitive) contribute
    nothing.

    Args:
        cast: Ordered sequence of dicts, each optionally carrying a
            ``'gender'`` string.

    Returns:
        ``female_score / (male_score + female_score)`` as a float in
        [0, 1], or NaN when no entry is male or female (including an empty
        cast), because the share is undefined in that case.
    """
    score = {'male': 0.0, 'female': 0.0}
    for i, actor in enumerate(cast):
        # Tolerate a missing or null gender instead of raising.
        gender = (actor.get('gender') or '').lower()
        if gender in score:
            # Higher weight for actors earlier in the list.
            score[gender] += 1 / math.log(i + 2)
    total = score['male'] + score['female']
    if total == 0:
        # No gendered cast at all: report "undefined" instead of dividing
        # by zero. NaN is skipped by pandas means and fails comparisons.
        return math.nan
    return score['female'] / total
# Worked example: three men billed ahead of one woman.
example_cast = [{'gender': g} for g in ('male', 'male', 'male', 'female')]
ex = female_cast_importance(example_cast)
print("female-cast-importance if the cast is 'male, male, male, female' is {}".format(ex))
female-cast-importance if the cast is 'male, male, male, female' is 0.16812753627111746
# Flatten every movie record into one row of the analysis table.
rows = []
for movie in movies:
    if movie['title'] == "Roti":
        # this one's broken
        continue
    # Only the first four billed names are shown in the tables; the
    # importance score below still uses the complete cast list.
    cast_string = ", ".join(x['name'] for x in movie['cast'][:4])
    # The tables are rendered with to_html(escape=False) so that this anchor
    # stays clickable, which means nothing escapes the pieces for us. Escape
    # the URL and title here so characters such as & < > " (for example in
    # "Mr. & Mrs. '55") cannot break the markup.
    title_link = '<a href="{}">{}</a>'.format(
        html.escape(movie['movie_url'], quote=True),
        html.escape(movie['title'], quote=False),
    )
    # put all of these into a pandas dataframe
    rows.append({
        'year': movie['year'],
        'title_link': title_link,
        'title': movie['title'],
        'female-led': movie['female-led'],
        'female_cast_importance': female_cast_importance(movie['cast']),
        'female_director': movie['director']['gender'].lower() == "female",
        'director_name': movie['director']['name'],
        'actors': cast_string,
    })
df = pd.DataFrame(rows)
# Manually override some movies that were incorrectly classified as
# female-led. Titles must match the dataset's 'title' field exactly.
# NOTE(review): matching is by title only, so if two hits from different
# years share a title, both are overridden - confirm this is intended.
female_led_false_positives = [
    "Mujhse Dosti Karoge!",
    "Pardes",
    "Anari",
    "Amrit",
    "Judaai",
    # The dataset spells this title with a double "a" ("Jawaan"); the
    # single-"a" spelling matched no row, so the override never applied.
    "Tum Haseen Main Jawaan",
    "Sharafat",
    "Aya Sawan Jhoom Ke",
    "An Evening in Paris",
    "Phool Aur Patthar",
    "Guide",
    "Gehra Daag",
    "Dil Apna Aur Preet Parayi",
    "Phagun",
    "Phir Subah Hogi",
    "Jagriti",
    "Sangdil",
    "Jadoo",
    "Arzoo",
    "Saathiya",
    "Ready",
    "Goliyon Ki Raasleela Ram-Leela",
    "Badrinath Ki Dulhania",
]
df.loc[df['title'].isin(female_led_false_positives), 'female-led'] = False
# Format floats with two decimals in pandas output.
pd.set_option('display.float_format', '{:.2f}'.format)
# Female-led hits, sorted by year.
is_female_led = df["female-led"].eq(True)
subset = df[is_female_led].sort_values(by="year")
table_html = subset.to_html(index=False, escape=False)
display(HTML(table_html))
year | title_link | title | female-led | female_cast_importance | female_director | director_name | actors |
---|---|---|---|---|---|---|---|
1947 | Neel Kamal | Neel Kamal | True | 0.70 | False | Kidar Sharma | Begum Para, Raj Kapoor, Madhubala |
1947 | Mirza Sahiban | Mirza Sahiban | True | 0.61 | False | K. Amarnath | Nurjehan, Trilok Kapoor |
1949 | Bari Behen | Bari Behen | True | 0.49 | False | D. D. Kashyap | Suraiya, Rehman, Ullhas, Geeta Bali |
1949 | Singaar | Singaar | True | 0.77 | False | J. K. Nanda | Suraiya, Madhubala, Jairaj |
1950 | Beqasoor | Beqasoor | True | 0.39 | False | K. Amarnath | Madhubala, Ajit, Yakub, Gope |
1951 | Hum Log | Hum Log | True | 0.75 | False | Zia Sarhadi | Nutan, Balraj Sahni, Shyama, Durga Khote |
1952 | Anhonee | Anhonee | True | 0.39 | False | K. A. Abbas | Raj Kapoor, Nargis |
1953 | Parineeta | Parineeta | True | 0.39 | False | Bimal Roy | Ashok Kumar, Meena Kumari |
1957 | Mother India | Mother India | True | 0.39 | False | Mehboob Khan | Nargis, Sunil Dutt, Rajendra Kumar, Raaj Kumar |
1958 | Sadhna | Sadhna | True | 0.31 | False | B.R. Chopra | Sunil Dutt, Vyjayanthimala, Leela Chitnis, Radhakrishan |
1959 | Sujata | Sujata | True | 0.70 | False | Bimal Roy | Nutan, Sunil Dutt, Shashikala |
1962 | Anpadh | Anpadh | True | 0.45 | False | Mohan Kumar | Balraj Sahni, Dharmendra, Mala Sinha, Shashikala |
1963 | Bandini | Bandini | True | 0.47 | False | Bimal Roy | Nutan, Ashok Kumar, Dharmendra |
1967 | Raat Aur Din | Raat Aur Din | True | 0.26 | False | Satyen Bose | Pradeep Kumar, Nargis, Feroz Khan, K N Singh |
1968 | Neel Kamal | Neel Kamal | True | 0.47 | False | Ram Maheshwari | Waheeda Rehman, Manoj Kumar, Raaj Kumar |
1969 | Aradhana | Aradhana | True | 0.43 | False | Shakti Samanta | Sharmila Tagore, Rajesh Khanna, Sujit Kumar, Farida Jalal |
1970 | Tum Haseen Main Jawaan | Tum Haseen Main Jawaan | True | 0.36 | False | Bhappi Sonie | Dharmendra, Hema Malini, Pran, Helen |
1986 | Nagina | Nagina | True | 0.57 | False | Harmesh Malhotra | Sridevi, Rishi Kapoor, Komal Mahuvakar, Amrish Puri |
1986 | Chameli Ki Shaadi | Chameli Ki Shaadi | True | 0.39 | False | Basu Chatterjee | Anil Kapoor, Amrita Singh |
1988 | Khoon Bhari Maang | Khoon Bhari Maang | True | 0.51 | False | Rakesh Roshan | Rekha, Kabir Bedi, Sonu Walia, Shatrughan Sinha |
1991 | Lamhe | Lamhe | True | 0.51 | False | Yash Chopra | Sridevi, Anil Kapoor, Waheeda Rehman, Anupam Kher |
1993 | Damini | Damini | True | 0.39 | False | Rajkumar Santoshi | Meenakshi Sheshadri, Sunny Deol, Rishi Kapoor, Amrish Puri |
1994 | Laadla | Laadla | True | 0.48 | False | Raj Kanwar | Sridevi, Anil Kapoor, Raveena Tandon, Anupam Kher |
1996 | Bandit Queen | Bandit Queen | True | 0.61 | False | Shekhar Kapur | Seema Biswas, Nirmal Pandey |
1999 | Taal | Taal | True | 0.34 | False | Subhash Ghai | Aishwarya Rai, Akshaye Khanna, Anil Kapoor, Amrish Puri |
2001 | Lajja | Lajja | True | 0.70 | False | Rajkumar Santoshi | Manisha Koirala, Rekha, Madhuri Dixit, Mahima Chaudhry |
2016 | Dangal | Dangal | True | 0.59 | False | Nitesh Tiwari | Aamir Khan, Sakshi Tanwar, Fatima Sana Shaikh, Zaira Wasim |
2017 | Secret Superstar | Secret Superstar | True | 0.59 | False | Advait Chandan | Zaira Wasim, Aamir Khan, Meher Vij, Raj Arjun |
2017 | Toilet: Ek Prem Katha | Toilet: Ek Prem Katha | True | 0.39 | False | Shree Narayan Singh | Akshay Kumar, Bhumi Pednekar |
2018 | Hichki | Hichki | True | 1.00 | False | Siddharth P. Malhotra | Rani Mukerji |
2018 | Pad Man | Pad Man | True | 0.39 | False | R. Balki | Akshay Kumar, Radhika Apte |
2020 | Chhapaak | Chhapaak | True | 0.61 | True | Meghna Gulzar | Deepika Padukone, Vikrant Massey |
2020 | Panga | Panga | True | 0.62 | True | Ashwiny Iyer Tiwari | Kangana Ranaut, Jassi Gill, Yagya Bhasin, Richa Chadda |
2020 | Thappad | Thappad | True | 1.00 | False | Anubhav Sinha | Taapsee Pannu |
2022 | Gangubai Kathiawadi | Gangubai Kathiawadi | True | 0.59 | False | Sanjay Leela Bhansali | Alia Bhatt, Shantanu Maheshwari, Seema Pahwa, Jim Sarbh |
2022 | Jugjugg Jeeyo | Jugjugg Jeeyo | True | 0.54 | False | Raj Mehta | Neetu Kapoor, Anil Kapoor, Varun Dhawan, Kiara Advani |
# Hits with a female director, restricted to the columns of interest.
director_columns = [
    "year",
    "title_link",
    "director_name",
    "actors",
    "female-led",
    "female_cast_importance",
]
subset = df.loc[df["female_director"], director_columns]
display(HTML(subset.to_html(escape=False)))
 | year | title_link | director_name | actors | female-led | female_cast_importance |
---|---|---|---|---|---|---|
180 | 2010 | Tees Maar Khan | Farah Khan | Akshay Kumar, Akshaye Khanna, Katrina Kaif | False | 0.23 |
225 | 2007 | Om Shanti Om | Farah Khan | Shah Rukh Khan, Deepika Padukone, Shreyas Talpade, Kirron Kher | False | 0.36 |
267 | 2014 | Happy New Year | Farah Khan | Deepika Padukone, Shah Rukh Khan, Abhishek Bachchan, Sonu Sood | False | 0.27 |
289 | 2019 | Gully Boy | Zoya Akhtar | Ranveer Singh, Alia Bhatt, Siddhant Chaturvedi | False | 0.30 |
320 | 2011 | Zindagi Na Milegi Dobara | Zoya Akhtar | Hrithik Roshan, Abhay Deol, Farhan Akhtar, Katrina Kaif | False | 0.28 |
347 | 2020 | Chhapaak | Meghna Gulzar | Deepika Padukone, Vikrant Massey | True | 0.61 |
483 | 2004 | Main Hoon Na | Farah Khan | Shah Rukh Khan, Sushmita Sen, Sunil Shetty, Amrita Rao | False | 0.36 |
552 | 2020 | Panga | Ashwiny Iyer Tiwari | Kangana Ranaut, Jassi Gill, Yagya Bhasin, Richa Chadda | True | 0.62 |
596 | 2015 | Dil Dhadakne Do | Zoya Akhtar | Anil Kapoor, Shefali Shah, Priyanka Chopra, Ranveer Singh | False | 0.46 |
641 | 2012 | Talaash: The Answer Lies Within | Reema Kagti | Aamir Khan, Kareena Kapoor, Rani Mukerji, Nawazuddin Siddiqui | False | 0.45 |
# Hits whose female cast importance is at least 0.6, highest score first.
subset = (
    df.loc[
        df["female_cast_importance"] >= 0.6,
        ["year", "title_link", "director_name", "actors", "female_cast_importance"],
    ]
    .sort_values(by="female_cast_importance", ascending=False)
)
display(HTML(subset.to_html(escape=False)))
 | year | title_link | director_name | actors | female_cast_importance |
---|---|---|---|---|---|
737 | 2020 | Thappad | Anubhav Sinha | Taapsee Pannu | 1.00 |
659 | 2018 | Hichki | Siddharth P. Malhotra | Rani Mukerji | 1.00 |
704 | 1947 | Dard | Abdul Rashid Kardar | Munawwar Sultana, Suraiya, Nusrat, Husn Banu | 0.80 |
575 | 1949 | Singaar | J. K. Nanda | Suraiya, Madhubala, Jairaj | 0.77 |
735 | 1951 | Hum Log | Zia Sarhadi | Nutan, Balraj Sahni, Shyama, Durga Khote | 0.75 |
317 | 2001 | Lajja | Rajkumar Santoshi | Manisha Koirala, Rekha, Madhuri Dixit, Mahima Chaudhry | 0.70 |
220 | 2000 | Fiza | Khalid Mohammed | Karisma Kapoor, Hrithik Roshan, Jaya Bachchan | 0.70 |
671 | 1947 | Neel Kamal | Kidar Sharma | Begum Para, Raj Kapoor, Madhubala | 0.70 |
528 | 2022 | Bhool Bhulaiyaa 2 | Anees Bazmee | Tabu, Kartik Aaryan, Kiara Advani | 0.70 |
625 | 1960 | Barsaat Ki Raat | P. L. Santoshi | Madhubala, Bharat Bhushan, Shyama | 0.70 |
385 | 1959 | Sujata | Bimal Roy | Nutan, Sunil Dutt, Shashikala | 0.70 |
405 | 1955 | Mr. & Mrs. '55 | Guru Dutt | Madhubala, Guru Dutt, Lalita Pawar | 0.70 |
648 | 1978 | Main Tulsi Tere Aangan Ki | Raj Khosla | Nutan, Vinod Khanna, Asha Parekh, Trilok Kapoor | 0.64 |
694 | 2007 | Chak De! India | Shimit Amin | Shah Rukh Khan, Vidya Malvade, Shilpa Shukla, Sagarika Ghatge | 0.63 |
654 | 1986 | Love 86 | Esmayeel Shroff | Tanuja, Rohan Kapoor, Govinda, Farha Naaz | 0.62 |
552 | 2020 | Panga | Ashwiny Iyer Tiwari | Kangana Ranaut, Jassi Gill, Yagya Bhasin, Richa Chadda | 0.62 |
639 | 1985 | Ram Teri Ganga Maili | Raj Kapoor | Mandakini, Rajiv Kapoor | 0.61 |
34 | 1996 | Bandit Queen | Shekhar Kapur | Seema Biswas, Nirmal Pandey | 0.61 |
359 | 2013 | Goliyon Ki Raasleela Ram-Leela | Sanjay Leela Bhansali | Deepika Padukone, Ranveer Singh | 0.61 |
48 | 2013 | Chennai Express | Rohit Shetty | Deepika Padukone, Shah Rukh Khan | 0.61 |
139 | 1951 | Baadal | Amiya Chakravarty | Madhubala, Prem Nath | 0.61 |
141 | 1994 | Hum Aapke Hain Koun..! | Sooraj Barjatya | Madhuri Dixit, Salman Khan | 0.61 |
304 | 1958 | Phagun | Bibhuti Mitra | Madhubala, Bharat Bhushan | 0.61 |
334 | 1948 | 11 O'Clock | Homi Wadia | Fearless Nadia, John Cawas | 0.61 |
347 | 2020 | Chhapaak | Meghna Gulzar | Deepika Padukone, Vikrant Massey | 0.61 |
480 | 2002 | Saathiya | Shaad Ali | Rani Mukerji, Vivek Oberoi | 0.61 |
373 | 1947 | Mirza Sahiban | K. Amarnath | Nurjehan, Trilok Kapoor | 0.61 |
418 | 1949 | Dillagi | A. R. Kardar | Suraiya, Shyam | 0.61 |
372 | 1960 | Kohinoor | S. U. Sunny | Dilip Kumar, Meena Kumari, Leela Chitnis, Kumkum | 0.61 |
630 | 1961 | Hum Dono | Amarjeet | Dev Anand, Sadhana, Nanda, Leela Chitnis | 0.61 |
315 | 1998 | Kuch Kuch Hota Hai | Karan Johar | Shah Rukh Khan, Kajol, Rani Mukerji, Sana Saeed | 0.61 |
553 | 1983 | Justice Chaudhury | K. Raghavendra Rao | Jeetendra, Sridevi, Hema Malini, Moushumi Chatterjee | 0.61 |
721 | 2008 | Bachna Ae Haseeno | Siddharth Anand | Ranbir Kapoor, Bipasha Basu, Deepika Padukone, Minissha Lamba | 0.61 |
455 | 1984 | Naya Kadam | K. Raghavendra Rao | Rajesh Khanna, Jaya Prada, Sridevi, Padmini Kolhapure | 0.61 |
490 | 1987 | Satyamev Jayate | Raj N. Sippy | Vinod Khanna, Meenakshi Sheshadri, Madhavi, Anita Raj | 0.61 |
#import os
#os.environ["PATH"] += os.pathsep + '/Library/TeX/texbin'
#plt.style.use(['science'])
#plt.style.use('default')

# Per-year mean of each measure, smoothed with a 5-year rolling mean.
by_year = df.groupby('year')
cast_importance = by_year['female_cast_importance'].mean().rolling(window=5).mean()
female_led = by_year['female-led'].mean().rolling(window=5).mean()
female_director = by_year['female_director'].mean().rolling(window=5).mean()

# A 4x3 inch figure rendered at 160 dpi.
desired_dpi = 160
fig_width, fig_height = 4, 3
plt.figure(figsize=(fig_width, fig_height), dpi=desired_dpi)

# Draw the three series in a fixed order so the legend order is unchanged.
for series, label in (
    (cast_importance, 'Female Cast Importance'),
    (female_led, 'Female-led Movies'),
    (female_director, 'Female Director Movies'),
):
    plt.plot(series, label=label)

# Axis labels (the title is intentionally left disabled).
#plt.title('Evolution of Female Representation in Movies')
plt.xlabel('Year')
plt.ylabel('5-year rolling averages')

plt.legend()
plt.show()