unir-metodologia/estudio_edad_madres_por_año_general.ipynb

229 KiB

from functools import cache
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)
# Upper-case, unaccented names of Mexican states (32 entries).
# NOTE(review): this set is not referenced in the visible part of the
# notebook; confirm it is still needed.
estados_mexicanos = {
    "AGUASCALIENTES",
    "BAJA CALIFORNIA",
    "BAJA CALIFORNIA SUR",
    "CAMPECHE",
    "CHIAPAS",
    "CHIHUAHUA",
    "COAHUILA DE ZARAGOZA",
    "COLIMA",
    "DISTRITO FEDERAL",
    "DURANGO",
    "GUANAJUATO",
    "GUERRERO",
    "HIDALGO",
    "JALISCO",
    "MEXICO",
    "MICHOACAN DE OCAMPO",
    "MORELOS",
    "NAYARIT",
    "NUEVO LEON",
    "OAXACA",
    "PUEBLA",
    # NOTE(review): the next entry contains two consecutive spaces; presumably
    # it mirrors the spelling used in the data -- verify before normalizing.
    "QUERETARO  DE ARTEAGA",
    "QUINTANA ROO",
    "SAN LUIS POTOSI",
    "SINALOA",
    "SONORA",
    "TABASCO",
    "TAMAULIPAS",
    "TLAXCALA",
    "VERACRUZ DE IGNACIO DE LA LLAVE",
    "YUCATAN",
    "ZACATECAS",
}
# Load the 2010-2019 records. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which removes the
# "Columns (21) have mixed types" DtypeWarning this call used to emit.
df = pd.read_csv("2010-2019.csv", low_memory=False)
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/3168623387.py:1: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv("2010-2019.csv")
def _ano_nacimiento_vivo_func(str_date):
    try:
        return str_date.split("/")[-1]
    except:
        return ""


# Birth year of the newborn, kept as text: the field after the last "/" of
# the birth date ("" when the date is missing).
df["año_de_nacimiento_vivo"] = df["fecha_nacimiento_nac_vivo"].apply(
    _ano_nacimiento_vivo_func
)
# Keep maternal ages strictly between 5 and 90. Rows with a missing age fail
# both comparisons and are dropped as well.
df = df[(5 < df["edad_madre"]) & (df["edad_madre"] < 90)]
# Records whose anomaly code text contains "Q9" (presumably the ICD-10
# chromosomal-abnormality range -- confirm against the data dictionary).
# .copy() gives df_trisomias its own data, so columns can be added to it later
# without pandas raising SettingWithCopyWarning.
df_trisomias = df[df["codigo_anomalia"].apply(lambda x: "Q9" in str(x))].copy()
# Per-birth-year summary of maternal age for the records in df_trisomias:
# count, mean, standard deviation, extremes, median and quartiles.
# pandas names the two anonymous aggregators "<lambda_0>" and "<lambda_1>";
# the rename gives them readable labels.
consulta_trisomias = (
    df_trisomias.groupby(["año_de_nacimiento_vivo"])
    .agg(
        {
            "edad_madre": [
                "count",
                "mean",
                "std",
                "min",
                "max",
                "median",
                lambda edades: edades.quantile(0.25),  # first quartile
                lambda edades: edades.quantile(0.75),  # third quartile
            ],
        }
    )
    .rename(columns={"<lambda_0>": "Q1", "<lambda_1>": "Q3"})
)
consulta_trisomias
edad_madre
count mean std min max median Q1 Q3
año_de_nacimiento_vivo
2010 930 30.546237 8.244939 10 48 31.0 23.0 37.75
2011 1051 31.010466 8.193777 12 49 32.0 24.0 38.00
2012 961 30.462019 8.310565 13 47 31.0 23.0 38.00
2013 1055 31.182938 8.247919 11 51 32.0 24.0 38.00
2014 1031 31.018429 8.356304 13 50 32.0 24.0 38.00
2015 1016 31.500984 8.295052 14 52 32.0 24.0 39.00
2016 1044 31.453065 8.147413 14 47 32.0 24.0 39.00
2017 1043 31.410355 8.174581 13 47 33.0 24.0 38.50
2018 1059 31.064212 8.173198 13 48 32.0 24.0 38.00
2019 941 32.018066 8.195918 13 47 34.0 25.0 39.00
año_de_nacimiento_vivo
2010     930
2011    1051
2012     961
2013    1055
2014    1031
2015    1016
2016    1044
2017    1043
2018    1059
2019     941
Name: (edad_madre, count), dtype: int64
# Mothers' ages: per-birth-year summary over every record kept in df
# (count, mean, standard deviation, extremes, median and quartiles).
# The two anonymous quantile aggregators come out of pandas labelled
# "<lambda_0>" / "<lambda_1>" and are relabelled Q1 / Q3.
consulta_total = (
    df.groupby(["año_de_nacimiento_vivo"])
    .agg(
        {
            "edad_madre": [
                "count",
                "mean",
                "std",
                "min",
                "max",
                "median",
                lambda edades: edades.quantile(0.25),  # first quartile
                lambda edades: edades.quantile(0.75),  # third quartile
            ],
        }
    )
    .rename(columns={"<lambda_0>": "Q1", "<lambda_1>": "Q3"})
)
consulta_total
edad_madre
count mean std min max median Q1 Q3
año_de_nacimiento_vivo
2010 2063533 25.253220 6.319567 9 58 24.0 20.0 30.0
2011 2156751 25.234223 6.331894 9 58 24.0 20.0 30.0
2012 2197327 25.195768 6.321840 9 58 24.0 20.0 30.0
2013 2189257 25.198235 6.322081 9 59 24.0 20.0 30.0
2014 2173773 25.276009 6.322130 9 58 24.0 20.0 30.0
2015 2143345 25.367835 6.296604 9 59 25.0 20.0 30.0
2016 2079251 25.468008 6.292815 9 59 25.0 20.0 30.0
2017 2037647 25.510821 6.305873 9 62 25.0 21.0 30.0
2018 1940338 25.678051 6.328369 9 60 25.0 21.0 30.0
2019 1867693 25.840630 6.342544 9 58 25.0 21.0 30.0
# Put the general and the trisomy summaries side by side, one row per year.
# The suffixes keep the two "edad_madre" column groups apart.
consulta = consulta_total.join(
    consulta_trisomias, lsuffix="_general", rsuffix="_trisomias"
)
# Share of each year's records that appear in the trisomy summary.
# Despite its name, "porcentaje" stores a fraction (it is not multiplied by 100).
consulta["porcentaje"] = consulta[("edad_madre_trisomias", "count")].div(
    consulta[("edad_madre_general", "count")]
)
consulta
edad_madre_general edad_madre_trisomias porcentaje
count mean std min max median Q1 Q3 count mean std min max median Q1 Q3
año_de_nacimiento_vivo
2010 2063533 25.253220 6.319567 9 58 24.0 20.0 30.0 930 30.546237 8.244939 10 48 31.0 23.0 37.75 0.000451
2011 2156751 25.234223 6.331894 9 58 24.0 20.0 30.0 1051 31.010466 8.193777 12 49 32.0 24.0 38.00 0.000487
2012 2197327 25.195768 6.321840 9 58 24.0 20.0 30.0 961 30.462019 8.310565 13 47 31.0 23.0 38.00 0.000437
2013 2189257 25.198235 6.322081 9 59 24.0 20.0 30.0 1055 31.182938 8.247919 11 51 32.0 24.0 38.00 0.000482
2014 2173773 25.276009 6.322130 9 58 24.0 20.0 30.0 1031 31.018429 8.356304 13 50 32.0 24.0 38.00 0.000474
2015 2143345 25.367835 6.296604 9 59 25.0 20.0 30.0 1016 31.500984 8.295052 14 52 32.0 24.0 39.00 0.000474
2016 2079251 25.468008 6.292815 9 59 25.0 20.0 30.0 1044 31.453065 8.147413 14 47 32.0 24.0 39.00 0.000502
2017 2037647 25.510821 6.305873 9 62 25.0 21.0 30.0 1043 31.410355 8.174581 13 47 33.0 24.0 38.50 0.000512
2018 1940338 25.678051 6.328369 9 60 25.0 21.0 30.0 1059 31.064212 8.173198 13 48 32.0 24.0 38.00 0.000546
2019 1867693 25.840630 6.342544 9 58 25.0 21.0 30.0 941 32.018066 8.195918 13 47 34.0 25.0 39.00 0.000504

Pendiente

Generar una gráfica de cajas con las edades de las madres de hijos con trisomías.

https://stackoverflow.com/a/66565512

# One dict of pre-computed box statistics per year, built from the trisomy
# summary. The keys use matplotlib's Axes.bxp naming (med, q1, q3, whislo,
# whishi, mean, label). The whiskers are the observed minimum and maximum.
stats = [
    {
        "label": year,
        "mean": row[("edad_madre", "mean")],
        "count": row[("edad_madre", "count")],
        "std": row[("edad_madre", "std")],
        "whislo": row[("edad_madre", "min")],
        "whishi": row[("edad_madre", "max")],
        "med": row[("edad_madre", "median")],
        "q1": row[("edad_madre", "Q1")],
        "q3": row[("edad_madre", "Q3")],
    }
    for year, row in consulta_trisomias.iterrows()
]
# Spread of the yearly statistics across the whole period.
consulta_trisomias.describe()
edad_madre
count mean std min max median Q1 Q3
count 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000
mean 1013.100000 31.166677 8.233967 12.600000 48.600000 32.100000 23.900000 38.325000
std 49.771589 0.461478 0.068843 1.264911 1.837873 0.875595 0.567646 0.500694
min 930.000000 30.462019 8.147413 10.000000 47.000000 31.000000 23.000000 37.750000
25% 974.750000 31.012457 8.179380 12.250000 47.000000 32.000000 24.000000 38.000000
50% 1037.000000 31.123575 8.220429 13.000000 48.000000 32.000000 24.000000 38.000000
75% 1049.250000 31.442388 8.283268 13.000000 49.750000 32.000000 24.000000 38.875000
max 1059.000000 32.018066 8.356304 14.000000 52.000000 34.000000 25.000000 39.000000
%matplotlib notebook
from matplotlib.figure import Figure
from matplotlib.ticker import AutoMinorLocator

fig = Figure()
ax = fig.add_subplot()
ax.bxp(
    stats,
    showfliers=False,
    showmeans=True,
)
ax.set_ylabel("Edad de la Madre")
ax.set_xlabel("Año de Registro")
ax.set_title("Distribuciones de Edad de las Madres")
ax.yaxis.set_minor_locator(AutoMinorLocator())

ax.grid(visible=True, which="both", axis="y", linewidth=1, alpha=0.2)
fig

def _anomalias_filtradas(anomalia):
    splitted = [x for x in anomalia.split(",") if len(x) == 4 and x[:2] == "Q9"]
    if len(splitted) < 2:
        return ",".join(splitted)
    if splitted[0] == splitted[1]:
        return splitted[0]
    return ",".join(splitted)


def _clasificador(codigos):

    if "Q910" in codigos or "Q911" in codigos or "Q912" in codigos or "Q913" in codigos:
        return "Edwards"
    if "Q914" in codigos or "Q915" in codigos or "Q916" in codigos or "Q917" in codigos:
        return "Patau"
    if "Q90" in codigos:
        return "Down"
    return "Otra"


# Syndrome label of every record: reduce the raw code list to its Q9xx codes,
# then classify it. Computed once and reused for the indicator columns below.
_trisomia = df_trisomias.codigo_anomalia.apply(_anomalias_filtradas).apply(
    _clasificador
)
# assign() builds a new DataFrame instead of writing into df_trisomias, which
# may be a slice of df. Writing column by column raised SettingWithCopyWarning.
# The column names are passed through a dict because they contain spaces and
# accents.
df_trisomias = df_trisomias.assign(
    **{
        "Trisomía": _trisomia,
        "Síndrome de Down": _trisomia == "Down",
        "Síndrome de Edwards": _trisomia == "Edwards",
        "Síndrome de Patau": _trisomia == "Patau",
        "Otro Síndrome": _trisomia == "Otra",
    }
)
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:30: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Trisomía"] = df_trisomias.codigo_anomalia.apply(
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:33: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Síndrome de Down"] = (df_trisomias["Trisomía"] == "Down")
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Síndrome de Edwards"] = (df_trisomias["Trisomía"] == "Edwards")
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Síndrome de Patau"] = (df_trisomias["Trisomía"] == "Patau")
/var/folders/05/y38rqjl55hjb_hbnypxzgrsw0000gn/T/ipykernel_93495/1078194369.py:36: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trisomias["Otro Síndrome"] =  (df_trisomias["Trisomía"] == "Otra")
# Display the column labels of df_trisomias.
# NOTE(review): the recorded output lists both unaccented ("Trisomia",
# "Sindrome de Down", ...) and accented ("Trisomía", "Síndrome de Down", ...)
# columns; the unaccented ones presumably come from the CSV -- confirm whether
# they are still needed.
df_trisomias.columns
Index(['Unnamed: 0.1', 'Unnamed: 0', 'edo_captura', 'edo_nac_madre',
       'fecha_nac_madre', 'edad_madre', 'estado_conyugal',
       'entidad_residencia_madre', 'numero_embarazos', 'hijos_nacidos_muertos',
       'hijos_nacidos_vivos', 'hijos_sobrevivientes', 'el_hijo_anterior_nacio',
       'vive_aun_hijo_anterior', 'orden_nacimiento',
       'recibio_atencion_prenatal', 'trimestre_recibio_primera_consulta',
       'total_consultas_recibidas', 'madre_sobrevivio_al_parto',
       'escolaridad_madre', 'ocupacion_habitual_madre', 'trabaja_actualmente',
       'fecha_nacimiento_nac_vivo', 'hora_nacimiento_nac_vivo',
       'sexo_nac_vivo', 'semanas_gestacion_nac_vivo', 'talla_nac_vivo',
       'peso_nac_vivo', 'valoracion_apgar_nac_vivo',
       'valoracion_silverman_nac_vivo', 'producto_de_un_embarazo',
       'codigo_anomalia', 'entidad_certifico', 'año_de_nacimiento_vivo',
       'Trisomia', 'Sindrome de Down', 'Sindrome de Edwards',
       'Sindrome de Patau', 'Otro Sindrome', 'Trisomía', 'Síndrome de Down',
       'Síndrome de Edwards', 'Síndrome de Patau', 'Otro Síndrome'],
      dtype='object')
import matplotlib.ticker as mtick

# 100 % stacked bar chart: for each birth year, the share of records in each
# syndrome category.
fig = Figure()
ax = fig.add_subplot()
# Yearly number of records per category (sum of the boolean indicator columns).
_to_plot = df_trisomias.groupby("año_de_nacimiento_vivo").agg(
    {
        columna: ["sum"]
        for columna in (
            "Síndrome de Down",
            "Síndrome de Edwards",
            "Síndrome de Patau",
            "Otro Síndrome",
        )
    }
)
_index = _to_plot.index.to_list()
_records = _to_plot.to_dict("records")
# Columns are (name, "sum") pairs; the legend only needs the name.
_labels = [columna[0] for columna in _to_plot.columns]
_data = _to_plot.to_numpy(dtype="float64")
# Row totals divided by 100, so that count / total is already in percent units.
_totals = _data.sum(axis=1) / 100
# Running top of each stacked bar, starting at zero.
bottom = np.zeros(_data.shape[0])
for i, label in enumerate(_labels):
    data = _data[:, i] / _totals
    ax.bar(_index, data, 0.7, label=label, bottom=bottom)
    bottom += data
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_title("Distribución de trisomias y otros síndromes")
ax.legend()
fig

# Yearly number of records per syndrome category (sum of the boolean
# indicator columns).
_to_plot = df_trisomias.groupby("año_de_nacimiento_vivo").agg(
    {
        "Síndrome de Down": ["sum"],
        "Síndrome de Edwards": ["sum"],
        "Síndrome de Patau": ["sum"],
        "Otro Síndrome": ["sum"],
    }
)
_to_plot.columns = ["Down", "Edwards", "Patau", "Otros"]
# Vectorised row total. This replaces the slower row-by-row apply(sum, axis=1).
# The aggregated counts contain no missing values, so the result is the same.
_to_plot["Total"] = _to_plot.sum(axis=1)
# Turn the four counts into fractions of the yearly total in one aligned
# division (axis=0 matches the divisor on the year index).
_sindromes = ["Down", "Edwards", "Patau", "Otros"]
_to_plot[_sindromes] = _to_plot[_sindromes].div(_to_plot["Total"], axis=0)
_to_plot.describe()
Down Edwards Patau Otros Total
count 10.000000 10.000000 10.000000 10.000000 10.000000
mean 0.846123 0.020646 0.007997 0.125234 1013.100000
std 0.040421 0.006969 0.003110 0.038154 49.771589
min 0.797699 0.010627 0.004162 0.065156 930.000000
25% 0.813188 0.016582 0.005697 0.105395 974.750000
50% 0.846609 0.019526 0.007144 0.126721 1037.000000
75% 0.861515 0.023294 0.010124 0.154212 1049.250000
max 0.912181 0.036433 0.012752 0.180645 1059.000000
# Line + scatter plot of the yearly share of trisomy records (in percent),
# with each point annotated with the absolute number of records.
fig = Figure()
ax = fig.add_subplot()
_x = [int(x) for x in (consulta.index.to_list())]
_z = consulta[("edad_madre_trisomias", "count")].to_list()
# "porcentaje" holds a fraction; scale it to percent for the axis formatter.
_y = (consulta.porcentaje * 100).to_list()
ax.plot(_x, _y, "--")
ax.scatter(_x, _y, label="Porcentaje y cantidad de nacidos con malformación")

# Label offsets (dx in years, dy in percent points). Most labels sit just
# below their point; the positions listed here were hand-tuned to avoid
# overlapping the line. Looping over the data replaces ten copy-pasted
# annotate calls with hard-coded indices 0-9, which raised IndexError for
# fewer than ten years and left any further year unlabelled.
_default_offset = (-0.075, -0.001)
_tuned_offsets = {
    1: (-0.075, 0.0005),
    3: (-0.075, 0.0005),
    8: (0.2, -0.00025),
}
for _pos, (_year, _pct, _count) in enumerate(zip(_x, _y, _z)):
    _dx, _dy = _tuned_offsets.get(_pos, _default_offset)
    ax.annotate(f"{int(_count)}", (_year + _dx, _pct + _dy))

# NOTE(review): the y-limits are fixed, in percent units, to fit the values
# currently in `consulta`; revisit if the data change.
ax.set_ylim(0.04, 0.055)
ax.xaxis.set_ticks(_x)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_title("Distribución de nacimiento con malformación")
ax.set_xlabel("Año")
ax.set_ylabel("Porcentaje")
ax.yaxis.set_minor_locator(AutoMinorLocator())
ax.grid(alpha=0.1, which="both")
ax.legend()
fig

# Summary statistics of the yearly fraction (not scaled to percent).
consulta.porcentaje.describe()
count    10.000000
mean      0.000487
std       0.000031
min       0.000437
25%       0.000474
50%       0.000485
75%       0.000503
max       0.000546
Name: porcentaje, dtype: float64
# One dict of pre-computed box statistics per year, built from the summary of
# all records. The keys use matplotlib's Axes.bxp naming (med, q1, q3, whislo,
# whishi, mean, label). The whiskers are the observed minimum and maximum.
stats = [
    {
        "label": year,
        "mean": row[("edad_madre", "mean")],
        "count": row[("edad_madre", "count")],
        "std": row[("edad_madre", "std")],
        "whislo": row[("edad_madre", "min")],
        "whishi": row[("edad_madre", "max")],
        "med": row[("edad_madre", "median")],
        "q1": row[("edad_madre", "Q1")],
        "q3": row[("edad_madre", "Q3")],
    }
    for year, row in consulta_total.iterrows()
]
%matplotlib notebook
from matplotlib.figure import Figure
from matplotlib.ticker import AutoMinorLocator

fig = Figure()
ax = fig.add_subplot()
ax.bxp(
    stats,
    showfliers=False,
    showmeans=True,
)
ax.set_ylabel("Edad de la Madre")
ax.set_xlabel("Año de Registro")
ax.set_title("Distribuciones Poblacionales de Edad de las Madres")
ax.yaxis.set_minor_locator(AutoMinorLocator())

ax.grid(visible=True, which="both", axis="y", linewidth=1, alpha=0.2)
fig

consulta_total.describe()
edad_madre
count mean std min max median Q1 Q3
count 1.000000e+01 10.000000 10.000000 10.0 10.000000 10.000000 10.000000 10.0
mean 2.084892e+06 25.402280 6.318372 9.0 58.900000 24.500000 20.300000 30.0
std 1.109581e+05 0.220145 0.015605 0.0 1.286684 0.527046 0.483046 0.0
min 1.867693e+06 25.195768 6.292815 9.0 58.000000 24.000000 20.000000 30.0
25% 2.044118e+06 25.238972 6.309296 9.0 58.000000 24.000000 20.000000 30.0
50% 2.111298e+06 25.321922 6.321961 9.0 58.500000 24.500000 20.000000 30.0
75% 2.169518e+06 25.500118 6.326809 9.0 59.000000 25.000000 20.750000 30.0
max 2.197327e+06 25.840630 6.342544 9.0 62.000000 25.000000 21.000000 30.0