1515import os
1616import struct
1717import sys
18- from typing import Any
18+ from typing import Any , Dict , Hashable , Optional , Sequence
1919import warnings
2020
2121from dateutil .relativedelta import relativedelta
2222import numpy as np
2323
2424from pandas ._libs .lib import infer_dtype
2525from pandas ._libs .writers import max_len_string_array
26+ from pandas ._typing import FilePathOrBuffer
2627from pandas .util ._decorators import Appender
2728
2829from pandas .core .dtypes .common import (
4748from pandas .io .common import get_filepath_or_buffer , stringify_path
4849
4950_version_error = (
50- "Version of given Stata file is not 104, 105, 108, "
51- "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
52- "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)"
51+ "Version of given Stata file is {version}. pandas supports importing "
52+ "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
53+ "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16), "
54+ "and 119 (Stata 15/16, over 32,767 variables)."
5355)
5456
5557_statafile_processing_params1 = """\
@@ -1090,11 +1092,11 @@ def _read_header(self):
10901092 self .col_sizes = [self ._calcsize (typ ) for typ in self .typlist ]
10911093
10921094 def _read_new_header (self , first_char ):
1093- # The first part of the header is common to 117 and 118 .
1095+ # The first part of the header is common to 117 - 119 .
10941096 self .path_or_buf .read (27 ) # stata_dta><header><release>
10951097 self .format_version = int (self .path_or_buf .read (3 ))
10961098 if self .format_version not in [117 , 118 , 119 ]:
1097- raise ValueError (_version_error )
1099+ raise ValueError (_version_error . format ( version = self . format_version ) )
10981100 self ._set_encoding ()
10991101 self .path_or_buf .read (21 ) # </release><byteorder>
11001102 self .byteorder = self .path_or_buf .read (3 ) == b"MSF" and ">" or "<"
@@ -1287,7 +1289,7 @@ def _get_seek_variable_labels(self):
12871289 def _read_old_header (self , first_char ):
12881290 self .format_version = struct .unpack ("b" , first_char )[0 ]
12891291 if self .format_version not in [104 , 105 , 108 , 111 , 113 , 114 , 115 ]:
1290- raise ValueError (_version_error )
1292+ raise ValueError (_version_error . format ( version = self . format_version ) )
12911293 self ._set_encoding ()
12921294 self .byteorder = (
12931295 struct .unpack ("b" , self .path_or_buf .read (1 ))[0 ] == 0x1 and ">" or "<"
@@ -2695,7 +2697,7 @@ def _convert_key(self, key):
26952697
26962698 def generate_table (self ):
26972699 """
2698- Generates the GSO lookup table for the DataFRame
2700+ Generates the GSO lookup table for the DataFrame
26992701
27002702 Returns
27012703 -------
@@ -2934,9 +2936,9 @@ def _write_header(self, data_label=None, time_stamp=None):
29342936 bio .write (self ._tag (bytes (str (self ._dta_version ), "utf-8" ), "release" ))
29352937 # byteorder
29362938 bio .write (self ._tag (byteorder == ">" and "MSF" or "LSF" , "byteorder" ))
2937- # number of vars, 2 bytes
2938- assert self .nvar < 2 ** 16
2939- bio .write (self ._tag (struct .pack (byteorder + "H" , self .nvar ), "K" ))
2939+ # number of vars, 2 bytes in 117 and 118, 4 bytes in 119
2940+ nvar_type = "H" if self ._dta_version <= 118 else "I"
2941+ bio .write (self ._tag (struct .pack (byteorder + nvar_type , self .nvar ), "K" ))
29402942 # 117 uses 4 bytes, 118 and 119 use 8
29412943 nobs_size = "I" if self ._dta_version == 117 else "Q"
29422944 bio .write (self ._tag (struct .pack (byteorder + nobs_size , self .nobs ), "N" ))
@@ -3033,7 +3035,8 @@ def _write_varnames(self):
30333035
30343036 def _write_sortlist (self ):
30353037 self ._update_map ("sortlist" )
3036- self ._file .write (self ._tag (b"\x00 \00 " * (self .nvar + 1 ), "sortlist" ))
3038+ sort_size = 2 if self ._dta_version < 119 else 4
3039+ self ._file .write (self ._tag (b"\x00 " * sort_size * (self .nvar + 1 ), "sortlist" ))
30373040
30383041 def _write_formats (self ):
30393042 self ._update_map ("formats" )
@@ -3173,13 +3176,14 @@ def _set_formats_and_types(self, dtypes):
31733176 )
31743177
31753178
3176- class StataWriter118 (StataWriter117 ):
3179+ class StataWriterUTF8 (StataWriter117 ):
31773180 """
3178- A class for writing Stata binary dta files in Stata 15 format (118)
3181+ Stata binary dta file writing in Stata 15 (118) and 16 (119) formats
31793182
3180- DTA 118 format files support unicode string data (both fixed and strL)
3181- format. Unicode is also supported in value labels, variable labels and
3182- the dataset label.
3183+ DTA 118 and 119 format files support unicode string data (both fixed
3184+ and strL) format. Unicode is also supported in value labels, variable
3185+ labels and the dataset label. Format 119 is automatically used if the
3186+ file contains more than 32,767 variables.
31833187
31843188 .. versionadded:: 1.0.0
31853189
@@ -3192,34 +3196,38 @@ class StataWriter118(StataWriter117):
31923196 is written.
31933197 data : DataFrame
31943198 Input to save
3195- convert_dates : dict
3199+ convert_dates : dict, default None
31963200 Dictionary mapping columns containing datetime types to stata internal
31973201 format to use when writing the dates. Options are 'tc', 'td', 'tm',
31983202 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
31993203 Datetime columns that do not have a conversion type specified will be
32003204 converted to 'tc'. Raises NotImplementedError if a datetime column has
32013205 timezone information
3202- write_index : bool
3206+ write_index : bool, default True
32033207 Write the index to Stata dataset.
3204- byteorder : str
3208+ byteorder : str, default None
32053209 Can be ">", "<", "little", or "big". default is `sys.byteorder`
3206- time_stamp : datetime
3210+ time_stamp : datetime, default None
32073211 A datetime to use as file creation date. Default is the current time
3208- data_label : str
3212+ data_label : str, default None
32093213 A label for the data set. Must be 80 characters or smaller.
3210- variable_labels : dict
3214+ variable_labels : dict, default None
32113215 Dictionary containing columns as keys and variable labels as values.
32123216 Each label must be 80 characters or smaller.
3213- convert_strl : list
3217+ convert_strl : list, default None
32143218 List of columns names to convert to Stata StrL format. Columns with
32153219 more than 2045 characters are automatically written as StrL.
32163220 Smaller columns can be converted by including the column name. Using
32173221 StrLs can reduce output file size when strings are longer than 8
32183222 characters, and either frequently repeated or sparse.
3223+ version : int, default None
3224+ The dta version to use. By default, uses the size of data to determine
3225+ the version. 118 is used if data.shape[1] <= 32767, and 119 is used
3226+ for storing larger DataFrames.
32193227
32203228 Returns
32213229 -------
3222- StataWriter118
3230+ StataWriterUTF8
32233231 The instance has a write_file method, which will write the file to the
32243232 given `fname`.
32253233
@@ -3238,24 +3246,60 @@ class StataWriter118(StataWriter117):
32383246 --------
32393247 Using Unicode data and column names
32403248
3241- >>> from pandas.io.stata import StataWriter118
3249+ >>> from pandas.io.stata import StataWriterUTF8
32423250 >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
3243- >>> writer = StataWriter118 ('./data_file.dta', data)
3251+ >>> writer = StataWriterUTF8 ('./data_file.dta', data)
32443252 >>> writer.write_file()
32453253
32463254 Or with long strings stored in strl format
32473255
32483256 >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
32493257 ... columns=['strls'])
3250- >>> writer = StataWriter118 ('./data_file_with_long_strings.dta', data,
3251- ... convert_strl=['strls'])
3258+ >>> writer = StataWriterUTF8 ('./data_file_with_long_strings.dta', data,
3259+ ... convert_strl=['strls'])
32523260 >>> writer.write_file()
32533261 """
32543262
32553263 _encoding = "utf-8"
3256- _dta_version = 118
32573264
3258- def _validate_variable_name (self , name ):
3265+ def __init__ (
3266+ self ,
3267+ fname : FilePathOrBuffer ,
3268+ data : DataFrame ,
3269+ convert_dates : Optional [Dict [Hashable , str ]] = None ,
3270+ write_index : bool = True ,
3271+ byteorder : Optional [str ] = None ,
3272+ time_stamp : Optional [datetime .datetime ] = None ,
3273+ data_label : Optional [str ] = None ,
3274+ variable_labels : Optional [Dict [Hashable , str ]] = None ,
3275+ convert_strl : Optional [Sequence [Hashable ]] = None ,
3276+ version : Optional [int ] = None ,
3277+ ):
3278+ if version is None :
3279+ version = 118 if data .shape [1 ] <= 32767 else 119
3280+ elif version not in (118 , 119 ):
3281+ raise ValueError ("version must be either 118 or 119." )
3282+ elif version == 118 and data .shape [1 ] > 32767 :
3283+ raise ValueError (
3284+ "You must use version 119 for data sets containing more than "
3285+ "32,767 variables"
3286+ )
3287+
3288+ super ().__init__ (
3289+ fname ,
3290+ data ,
3291+ convert_dates = convert_dates ,
3292+ write_index = write_index ,
3293+ byteorder = byteorder ,
3294+ time_stamp = time_stamp ,
3295+ data_label = data_label ,
3296+ variable_labels = variable_labels ,
3297+ convert_strl = convert_strl ,
3298+ )
3299+ # Override version set in StataWriter117 init
3300+ self ._dta_version = version
3301+
3302+ def _validate_variable_name (self , name : str ) -> str :
32593303 """
32603304 Validate variable names for Stata export.
32613305
@@ -3272,7 +3316,7 @@ def _validate_variable_name(self, name):
32723316
32733317 Notes
32743318 -----
3275- Stata 118 support most unicode characters. The only limatation is in
3319+ Stata 118+ support most unicode characters. The only limitation is in
32763320 the ascii range where the characters supported are a-z, A-Z, 0-9 and _.
32773321 """
32783322 # High code points appear to be acceptable
0 commit comments