sqlglot.dialects.bigquery
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot.optimizer.annotate_types import TypeAnnotator

from sqlglot import exp, generator, jsonpath, parser, tokens, transforms
from sqlglot._typing import E
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    annotate_with_type_lambda,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
    strposition_sql,
    groupconcat_sql,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType
from sqlglot.generator import unsupported_args

if t.TYPE_CHECKING:
    from sqlglot._typing import Lit

    from sqlglot.optimizer.annotate_types import TypeAnnotator

logger = logging.getLogger("sqlglot")


JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray]

DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# workaround for bigquery bug when grouping by an expression and then ordering
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY x + 1
# ORDER BY z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression
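# Editor's sketch (not part of the upstream module): the workaround above rewrites a
# grouped expression to its projection alias so BigQuery can also order by it, e.g.
#
#   import sqlglot
#   sqlglot.transpile("SELECT y + 1 AS z FROM x GROUP BY y + 1 ORDER BY z", write="bigquery")
#   # expected (indicative): ['SELECT y + 1 AS z FROM x GROUP BY z ORDER BY z']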

def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression
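# Editor's sketch (not part of the upstream module): CTE column names are pushed down
# into the CTE's projections, since BigQuery lacks the WITH t(a, b) AS (...) syntax, e.g.
#
#   import sqlglot
#   sqlglot.transpile("WITH t(a, b) AS (SELECT 1, 2) SELECT a FROM t", write="bigquery")
#   # expected (indicative): ['WITH t AS (SELECT 1 AS a, 2 AS b) SELECT a FROM t']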

def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)
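# Editor's sketch (not part of the upstream module): because TO_HEX(MD5(..)) is parsed
# into exp.MD5 by _build_to_hex above, it transpiles cleanly to dialects whose MD5
# returns a hex string directly, e.g.
#
#   import sqlglot
#   sqlglot.transpile("SELECT TO_HEX(MD5('a'))", read="bigquery", write="duckdb")
#   # expected (indicative): ["SELECT MD5('a')"]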

def _build_json_strip_nulls(args: t.List) -> exp.JSONStripNulls:
    expression = exp.JSONStripNulls(this=seq_get(args, 0))

    for arg in args[1:]:
        if isinstance(arg, exp.Kwarg):
            expression.set(arg.this.name.lower(), arg)
        else:
            expression.set("expression", arg)

    return expression


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)
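# Editor's sketch (not part of the upstream module): _unix_to_time_sql above picks the
# TIMESTAMP_SECONDS/MILLIS/MICROS variant from the expression's scale, so these
# round-trip unchanged, while other scales fall back to a BIGINT division, e.g.
#
#   import sqlglot
#   sqlglot.parse_one("SELECT TIMESTAMP_MICROS(1)", read="bigquery").sql("bigquery")
#   # expected (indicative): 'SELECT TIMESTAMP_MICROS(1)'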

def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToDatetime.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _build_regexp_extract(
    expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        try:
            group = re.compile(args[1].name).groups == 1
        except re.error:
            group = False

        # Default group is used for the transpilation of REGEXP_EXTRACT_ALL
        return expr_type(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if group else default_group,
        )

    return _builder


def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    def _builder(args: t.List, dialect: Dialect) -> E:
        if len(args) == 1:
            # The default value for the JSONPath is '$', i.e. all of the data
            args.append(exp.Literal.string("$"))
        return parser.build_extract_json_with_path(expr_type)(args, dialect)

    return _builder


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E:
    """
    Many BigQuery math functions such as CEIL, FLOOR, etc. follow this return type convention:
    +---------+---------+---------+------------+---------+
    | INPUT   | INT64   | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    | OUTPUT  | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    """
    self._annotate_args(expression)

    this: exp.Expression = expression.this

    self._set_type(
        expression,
        exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type,
    )
    return expression
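# Editor's sketch (not part of the upstream module): per the table in
# _annotate_math_functions above, FLOOR over an INT64 argument is typed FLOAT64, e.g.
#
#   from sqlglot import exp, parse_one
#   from sqlglot.optimizer.annotate_types import annotate_types
#   select = annotate_types(parse_one("SELECT FLOOR(1)"), dialect="bigquery")
#   assert select.selects[0].is_type(exp.DataType.Type.DOUBLE)  # indicative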

def _annotate_by_args_with_coerce(self: TypeAnnotator, expression: E) -> E:
    """
    +------------+------------+------------+------------+---------+
    | INPUT      | INT64      | NUMERIC    | BIGNUMERIC | FLOAT64 |
    +------------+------------+------------+------------+---------+
    | INT64      | INT64      | NUMERIC    | BIGNUMERIC | FLOAT64 |
    | NUMERIC    | NUMERIC    | NUMERIC    | BIGNUMERIC | FLOAT64 |
    | BIGNUMERIC | BIGNUMERIC | BIGNUMERIC | BIGNUMERIC | FLOAT64 |
    | FLOAT64    | FLOAT64    | FLOAT64    | FLOAT64    | FLOAT64 |
    +------------+------------+------------+------------+---------+
    """
    self._annotate_args(expression)

    self._set_type(expression, self._maybe_coerce(expression.this.type, expression.expression.type))
    return expression


def _annotate_by_args_approx_top(self: TypeAnnotator, expression: exp.ApproxTopK) -> exp.ApproxTopK:
    self._annotate_args(expression)

    struct_type = exp.DataType(
        this=exp.DataType.Type.STRUCT,
        expressions=[expression.this.type, exp.DataType(this=exp.DataType.Type.BIGINT)],
        nested=True,
    )
    self._set_type(
        expression,
        exp.DataType(this=exp.DataType.Type.ARRAY, expressions=[struct_type], nested=True),
    )

    return expression


@unsupported_args("ins_cost", "del_cost", "sub_cost")
def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str:
    max_dist = expression.args.get("max_dist")
    if max_dist:
        max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist)

    return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist)


def _build_levenshtein(args: t.List) -> exp.Levenshtein:
    max_dist = seq_get(args, 2)
    return exp.Levenshtein(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        max_dist=max_dist.expression if max_dist else None,
    )


def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]:
    def _builder(args: t.List) -> exp.TimeToStr:
        return exp.TimeToStr(
            this=expr_type(this=seq_get(args, 1)),
            format=seq_get(args, 0),
            zone=seq_get(args, 2),
        )

    return _builder


def _build_contains_substring(args: t.List) -> exp.Contains:
    # Lowercase the operands in case of transpilation, as exp.Contains
    # is case-sensitive on other dialects
    this = exp.Lower(this=seq_get(args, 0))
    expr = exp.Lower(this=seq_get(args, 1))

    return exp.Contains(this=this, expression=expr, json_scope=seq_get(args, 2))


def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    name = (expression._meta and expression.meta.get("name")) or expression.sql_name()
    upper = name.upper()

    dquote_escaping = upper in DQUOTES_ESCAPING_JSON_FUNCTIONS

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = False

    sql = rename_func(upper)(self, expression)

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = True

    return sql


def _annotate_concat(self: TypeAnnotator, expression: exp.Concat) -> exp.Concat:
    annotated = self._annotate_by_args(expression, "expressions")

    # Args must be BYTES or types that can be cast to STRING, return type is either BYTES or STRING
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#concat
    if not annotated.is_type(exp.DataType.Type.BINARY, exp.DataType.Type.UNKNOWN):
        annotated.type = exp.DataType.Type.VARCHAR

    return annotated


def _annotate_array(self: TypeAnnotator, expression: exp.Array) -> exp.Array:
    array_args = expression.expressions

    # BigQuery behaves as follows:
    #
    # SELECT t, TYPEOF(t) FROM (SELECT 'foo') AS t -- foo, STRUCT<STRING>
    # SELECT ARRAY(SELECT 'foo'), TYPEOF(ARRAY(SELECT 'foo')) -- foo, ARRAY<STRING>
    if (
        len(array_args) == 1
        and isinstance(select := array_args[0].unnest(), exp.Select)
        and (query_type := select.meta.get("query_type")) is not None
        and query_type.is_type(exp.DataType.Type.STRUCT)
        and len(query_type.expressions) == 1
        and isinstance(col_def := query_type.expressions[0], exp.ColumnDef)
        and (projection_type := col_def.kind) is not None
        and not projection_type.is_type(exp.DataType.Type.UNKNOWN)
    ):
        array_type = exp.DataType(
            this=exp.DataType.Type.ARRAY,
            expressions=[projection_type.copy()],
            nested=True,
        )
        return self._annotate_with_type(expression, array_type)

    return self._annotate_by_args(expression, "expressions", array=True)
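# Editor's sketch (not part of the upstream module): _annotate_array above types
# ARRAY(SELECT <single projection>) from the subquery's projection type, e.g.
#
#   from sqlglot import parse_one
#   from sqlglot.optimizer.annotate_types import annotate_types
#   q = annotate_types(parse_one("SELECT ARRAY(SELECT 'foo')", read="bigquery"), dialect="bigquery")
#   # q.selects[0].type should render as ARRAY<STRING> (indicative)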

class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # BigQuery UDFs are case-sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }
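    # Editor's sketch (not part of the upstream module): the format tables above drive
    # format-string translation when transpiling, e.g. (output indicative):
    #
    #   import sqlglot
    #   sqlglot.transpile("SELECT TO_CHAR(d, 'YYYY-MM-DD')", read="postgres", write="bigquery")
    #   # -> ["SELECT FORMAT_DATE('%Y-%m-%d', d)"]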
    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix
    # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE", "_TABLE_SUFFIX", "_FILE_NAME"}

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont
    COERCES_TO = {
        **TypeAnnotator.COERCES_TO,
        exp.DataType.Type.BIGDECIMAL: {exp.DataType.Type.DOUBLE},
    }
    COERCES_TO[exp.DataType.Type.DECIMAL] |= {exp.DataType.Type.BIGDECIMAL}
    COERCES_TO[exp.DataType.Type.BIGINT] |= {exp.DataType.Type.BIGDECIMAL}

    # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types
    TYPE_TO_EXPRESSIONS = {
        **Dialect.TYPE_TO_EXPRESSIONS,
        exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP],
    }
    TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP)

    ANNOTATORS = {
        **Dialect.ANNOTATORS,
        **{
            expr_type: annotate_with_type_lambda(data_type)
            for data_type, expressions in TYPE_TO_EXPRESSIONS.items()
            for expr_type in expressions
        },
        **{
            expr_type: lambda self, e: _annotate_math_functions(self, e)
            for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round)
        },
        **{
            expr_type: lambda self, e: self._annotate_by_args(e, "this")
            for expr_type in (
                exp.Abs,
                exp.ArgMax,
                exp.ArgMin,
                exp.DateTrunc,
                exp.DatetimeTrunc,
                exp.FirstValue,
                exp.GroupConcat,
                exp.IgnoreNulls,
                exp.JSONExtract,
                exp.Lead,
                exp.Left,
                exp.Lower,
                exp.NthValue,
                exp.Pad,
                exp.PercentileDisc,
                exp.RegexpExtract,
                exp.RegexpReplace,
                exp.Repeat,
                exp.Replace,
                exp.RespectNulls,
                exp.Reverse,
                exp.Right,
                exp.SafeNegate,
                exp.Sign,
                exp.Substring,
                exp.TimestampTrunc,
                exp.Translate,
                exp.Trim,
                exp.Upper,
            )
        },
        exp.Acos: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Acosh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Asin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Asinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Atan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Atanh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Atan2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.ApproxTopSum: lambda self, e: _annotate_by_args_approx_top(self, e),
        exp.ApproxTopK: lambda self, e: _annotate_by_args_approx_top(self, e),
        exp.ApproxQuantiles: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.Array: _annotate_array,
        exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
        exp.Ascii: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseAndAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseOrAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseXorAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseCountAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.ByteLength: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.ByteString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Cbrt: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CodePointsToBytes: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.BINARY
        ),
        exp.CodePointsToString: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.Concat: _annotate_concat,
        exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Cot: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CosineDistance: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Coth: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Csc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Csch: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CumeDist: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.DateFromUnixDate: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATE),
        exp.DenseRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.EuclideanDistance: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.DOUBLE
        ),
        exp.FarmFingerprint: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Unhex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Float64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Format: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.GenerateTimestampArray: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<TIMESTAMP>", dialect="bigquery")
        ),
        exp.Grouping: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.IsInf: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.IsNan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.JSONArray: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONArrayAppend: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONArrayInsert: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.JSONExtractScalar: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.JSONExtractArray: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.JSONFormat: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.JSON if e.args.get("to_json") else exp.DataType.Type.VARCHAR
        ),
        exp.JSONKeysAtDepth: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery")
        ),
        exp.JSONObject: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONRemove: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONSet: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONStripNulls: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONType: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.JSONValueArray: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery")
        ),
        exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"),
        exp.LowerHex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.LaxBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.LaxFloat64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.LaxInt64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.LaxString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.MD5Digest: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Normalize: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.Ntile: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.ParseTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.ParseDatetime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATETIME),
        exp.ParseBignumeric: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.BIGDECIMAL
        ),
        exp.ParseNumeric: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DECIMAL),
        exp.PercentileCont: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.PercentRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Rank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.RangeBucket: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.RegexpExtractAll: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.RegexpInstr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.RowNumber: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Rand: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.SafeConvertBytesToString: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.SafeAdd: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.SafeMultiply: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.SafeSubtract: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.Sec: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Sech: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Soundex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Sin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Sinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.TimestampFromParts: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.DATETIME
        ),
        exp.TimeFromParts: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.TimeTrunc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.ToCodePoints: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<BIGINT>", dialect="bigquery")
        ),
        exp.TsOrDsToTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.Unicode: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Uuid: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
    }

    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

            return t.cast(E, expression)

        return super().normalize_identifier(expression)
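    # Editor's sketch (not part of the upstream module): with the heuristic above,
    # unqualified identifiers fold to lowercase while qualified table names keep their
    # casing, e.g. (output indicative):
    #
    #   from sqlglot import parse_one
    #   from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
    #   q = parse_one("SELECT Col FROM my_project.my_dataset.My_Table", read="bigquery")
    #   normalize_identifiers(q, dialect="bigquery").sql("bigquery")
    #   # -> 'SELECT col FROM my_project.my_dataset.My_Table'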
    class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
        VAR_TOKENS = {
            TokenType.DASH,
            TokenType.VAR,
        }

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        NESTED_COMMENTS = False

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.DECLARE,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "EXPORT": TokenType.EXPORT,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "LOOP": TokenType.COMMAND,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "REPEAT": TokenType.COMMAND,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
            "WHILE": TokenType.COMMAND,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")
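    # Editor's sketch (not part of the upstream module): the tokenizer above accepts
    # BigQuery's `#` comments, which sqlglot preserves on output, e.g. (indicative):
    #
    #   import sqlglot
    #   sqlglot.transpile("SELECT 1 # trailing comment", read="bigquery")
    #   # -> ['SELECT 1 /* trailing comment */']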
    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True
        JOINS_HAVE_EQUAL_PRECEDENCE = True

        # BigQuery does not allow ASC/DESC to be used as identifiers, but it does allow GRANT
        ID_VAR_TOKENS = {
            *parser.Parser.ID_VAR_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        ALIAS_TOKENS = {
            *parser.Parser.ALIAS_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        TABLE_ALIAS_TOKENS = {
            *parser.Parser.TABLE_ALIAS_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        COMMENT_TABLE_ALIAS_TOKENS = {
            *parser.Parser.COMMENT_TABLE_ALIAS_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        UPDATE_ALIAS_TOKENS = {
            *parser.Parser.UPDATE_ALIAS_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list,
            "BIT_AND": exp.BitwiseAndAgg.from_arg_list,
            "BIT_OR": exp.BitwiseOrAgg.from_arg_list,
            "BIT_XOR": exp.BitwiseXorAgg.from_arg_list,
            "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list,
            "BOOL": exp.JSONBool.from_arg_list,
            "CONTAINS_SUBSTR": _build_contains_substring,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=seq_get(args, 1),
                this=seq_get(args, 0),
                zone=seq_get(args, 2),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "EDIT_DISTANCE": _build_levenshtein,
            "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list,
            "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
            "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_STRIP_NULLS": _build_json_strip_nulls,
            "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize(
                this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True
            ),
            "OCTET_LENGTH": exp.ByteLength.from_arg_list,
            "TO_HEX": _build_to_hex,
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_EXTRACT_ALL": _build_regexp_extract(
                exp.RegexpExtractAll, default_group=exp.Literal.number(0)
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "STRPOS": exp.StrPosition.from_arg_list,
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON": lambda args: exp.JSONFormat(
                this=seq_get(args, 0), options=seq_get(args, 1), to_json=True
            ),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
            "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime),
            "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp),
            "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime),
            "FROM_HEX": exp.Unhex.from_arg_list,
            "WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))),
        }
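        # Editor's sketch (not part of the upstream module): the JSON extraction
        # builders above default the JSONPath argument to '$', e.g. (indicative):
        #
        #   from sqlglot import parse_one
        #   parse_one("SELECT JSON_EXTRACT_SCALAR(x)", read="bigquery").sql("bigquery")
        #   # -> "SELECT JSON_EXTRACT_SCALAR(x, '$')"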
"WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 869 } 870 871 FUNCTION_PARSERS = { 872 **parser.Parser.FUNCTION_PARSERS, 873 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 874 "JSON_ARRAY": lambda self: self.expression( 875 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 876 ), 877 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 878 "PREDICT": lambda self: self._parse_ml(exp.Predict), 879 "TRANSLATE": lambda self: self._parse_translate(), 880 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 881 "GENERATE_EMBEDDING": lambda self: self._parse_ml(exp.GenerateEmbedding), 882 "GENERATE_TEXT_EMBEDDING": lambda self: self._parse_ml( 883 exp.GenerateEmbedding, is_text=True 884 ), 885 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 886 "FORECAST": lambda self: self._parse_ml(exp.MLForecast), 887 } 888 FUNCTION_PARSERS.pop("TRIM") 889 890 NO_PAREN_FUNCTIONS = { 891 **parser.Parser.NO_PAREN_FUNCTIONS, 892 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 893 } 894 895 NESTED_TYPE_TOKENS = { 896 *parser.Parser.NESTED_TYPE_TOKENS, 897 TokenType.TABLE, 898 } 899 900 PROPERTY_PARSERS = { 901 **parser.Parser.PROPERTY_PARSERS, 902 "NOT DETERMINISTIC": lambda self: self.expression( 903 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 904 ), 905 "OPTIONS": lambda self: self._parse_with_property(), 906 } 907 908 CONSTRAINT_PARSERS = { 909 **parser.Parser.CONSTRAINT_PARSERS, 910 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 911 } 912 913 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 914 RANGE_PARSERS.pop(TokenType.OVERLAPS) 915 916 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 917 918 STATEMENT_PARSERS = { 919 **parser.Parser.STATEMENT_PARSERS, 920 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 921 TokenType.END: lambda self: self._parse_as_command(self._prev), 922 TokenType.FOR: lambda self: self._parse_for_in(), 923 TokenType.EXPORT: lambda self: self._parse_export_data(), 924 TokenType.DECLARE: lambda self: self._parse_declare(), 925 } 926 927 BRACKET_OFFSETS = { 928 "OFFSET": (0, False), 929 "ORDINAL": (1, False), 930 "SAFE_OFFSET": (0, True), 931 "SAFE_ORDINAL": (1, True), 932 } 933 934 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 935 index = self._index 936 this = self._parse_range() 937 self._match_text_seq("DO") 938 if self._match(TokenType.COMMAND): 939 self._retreat(index) 940 return self._parse_as_command(self._prev) 941 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 942 943 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 944 this = super()._parse_table_part(schema=schema) or self._parse_number() 945 946 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 947 if isinstance(this, exp.Identifier): 948 table_name = this.name 949 while self._match(TokenType.DASH, advance=False) and self._next: 950 start = self._curr 951 while self._is_connected() and not self._match_set( 952 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 953 ): 954 self._advance() 955 956 if start == self._curr: 957 break 958 959 table_name += self._find_sql(start, self._prev) 960 961 this = exp.Identifier( 962 this=table_name, quoted=this.args.get("quoted") 963 ).update_positions(this) 964 elif isinstance(this, exp.Literal): 965 table_name = this.name 966 967 if self._is_connected() and 
        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    previous_db = table.args["db"]
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set(
                            "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db)
                        )
                        table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db))
                else:
                    previous_this = table.this
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set(
                            "db", exp.Identifier(this=parts[0]).update_positions(previous_this)
                        )
                        table.set(
                            "this", exp.Identifier(this=parts[1]).update_positions(previous_this)
                        )

            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
                alias = table.this
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                for part in (catalog, db, this):
                    if part:
                        part.update_positions(table.this)

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True
            else:
                alias = None

            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
            # dataset, so if the project identifier is omitted we need to fix the ast so that
            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
            # views, because it would seem like the "catalog" part is set, when it'd actually
            # be the region/dataset. Merging the two identifiers into a single one is done to
            # avoid producing a 4-part Table reference, which would cause issues in the schema
            # module, when there are 3-part table names mixed with information schema views.
            #
            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
            table_parts = table.parts
            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
                # We need to alias the table here to avoid breaking existing qualified columns.
                # This is expected to be safe, because if there's an actual alias coming up in
                # the token stream, it will overwrite this one. If there isn't one, we are only
                # exposing the name that can be used to reference the view explicitly (a no-op).
                exp.alias_(
                    table,
                    t.cast(exp.Identifier, alias or table_parts[-1]),
                    table=True,
                    copy=False,
                )

                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
                new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions(
                    line=table_parts[-2].meta.get("line"),
                    col=table_parts[-1].meta.get("col"),
                    start=table_parts[-2].meta.get("start"),
                    end=table_parts[-1].meta.get("end"),
                )
                table.set("this", new_this)
                table.set("db", seq_get(table_parts, -3))
                table.set("catalog", seq_get(table_parts, -4))

            return table
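        # Editor's sketch (not part of the upstream module): _parse_table_parts above
        # merges INFORMATION_SCHEMA views into a single quoted identifier, e.g.
        #
        #   from sqlglot import exp, parse_one
        #   q = parse_one("SELECT * FROM mydataset.INFORMATION_SCHEMA.TABLES", read="bigquery")
        #   q.find(exp.Table).this.name  # -> 'INFORMATION_SCHEMA.TABLES' (indicative)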
        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object
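        # Editor's sketch (not part of the upstream module): the "signature 2" form of
        # JSON_OBJECT is normalized into key-value pairs, e.g. (indicative):
        #
        #   from sqlglot import parse_one
        #   parse_one("SELECT JSON_OBJECT(['a', 'b'], [1, 2])", read="bigquery").sql("bigquery")
        #   # -> "SELECT JSON_OBJECT('a', 1, 'b', 2)"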
unnest.set("explode_array", True) 1145 1146 return unnest 1147 1148 def _parse_make_interval(self) -> exp.MakeInterval: 1149 expr = exp.MakeInterval() 1150 1151 for arg_key in expr.arg_types: 1152 value = self._parse_lambda() 1153 1154 if not value: 1155 break 1156 1157 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1158 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1159 if isinstance(value, exp.Kwarg): 1160 arg_key = value.this.name 1161 1162 expr.set(arg_key, value) 1163 1164 self._match(TokenType.COMMA) 1165 1166 return expr 1167 1168 def _parse_ml(self, expr_type: t.Type[E], **kwargs) -> E: 1169 self._match_text_seq("MODEL") 1170 this = self._parse_table() 1171 1172 self._match(TokenType.COMMA) 1173 self._match_text_seq("TABLE") 1174 1175 # Certain functions like ML.FORECAST require a STRUCT argument but not a TABLE/SELECT one 1176 expression = ( 1177 self._parse_table() if not self._match(TokenType.STRUCT, advance=False) else None 1178 ) 1179 1180 self._match(TokenType.COMMA) 1181 1182 return self.expression( 1183 expr_type, 1184 this=this, 1185 expression=expression, 1186 params_struct=self._parse_bitwise(), 1187 **kwargs, 1188 ) 1189 1190 def _parse_translate(self) -> exp.Translate | exp.MLTranslate: 1191 # Check if this is ML.TRANSLATE by looking at previous tokens 1192 token = seq_get(self._tokens, self._index - 4) 1193 if token and token.text.upper() == "ML": 1194 return self._parse_ml(exp.MLTranslate) 1195 1196 return exp.Translate.from_arg_list(self._parse_function_args()) 1197 1198 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 1199 self._match(TokenType.TABLE) 1200 this = self._parse_table() 1201 1202 expr = self.expression(exp.FeaturesAtTime, this=this) 1203 1204 while self._match(TokenType.COMMA): 1205 arg = self._parse_lambda() 1206 1207 # Get the LHS of the Kwarg and set the arg to that value, e.g 1208 # "num_rows => 1" sets the expr's `num_rows` arg 1209 if arg: 1210 expr.set(arg.this.name, arg) 1211 1212 return expr 1213 1214 def _parse_vector_search(self) -> exp.VectorSearch: 1215 self._match(TokenType.TABLE) 1216 base_table = self._parse_table() 1217 1218 self._match(TokenType.COMMA) 1219 1220 column_to_search = self._parse_bitwise() 1221 self._match(TokenType.COMMA) 1222 1223 self._match(TokenType.TABLE) 1224 query_table = self._parse_table() 1225 1226 expr = self.expression( 1227 exp.VectorSearch, 1228 this=base_table, 1229 column_to_search=column_to_search, 1230 query_table=query_table, 1231 ) 1232 1233 while self._match(TokenType.COMMA): 1234 # query_column_to_search can be named argument or positional 1235 if self._match(TokenType.STRING, advance=False): 1236 query_column = self._parse_string() 1237 expr.set("query_column_to_search", query_column) 1238 else: 1239 arg = self._parse_lambda() 1240 if arg: 1241 expr.set(arg.this.name, arg) 1242 1243 return expr 1244 1245 def _parse_export_data(self) -> exp.Export: 1246 self._match_text_seq("DATA") 1247 1248 return self.expression( 1249 exp.Export, 1250 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1251 options=self._parse_properties(), 1252 this=self._match_text_seq("AS") and self._parse_select(), 1253 ) 1254 1255 class Generator(generator.Generator): 1256 INTERVAL_ALLOWS_PLURAL_FORM = False 1257 JOIN_HINTS = False 1258 QUERY_HINTS = False 1259 TABLE_HINTS = False 1260 LIMIT_FETCH = "LIMIT" 1261 RENAME_TABLE_WITH_DB = False 1262 NVL2_SUPPORTED = False 1263 UNNEST_WITH_ORDINALITY = False 1264 
    class Generator(generator.Generator):
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"
        SUPPORTS_EXPLODING_PROJECTIONS = False
        EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
        SUPPORTS_UNIX_SECONDS = True

        SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$")

        TS_OR_DS_TYPES = (
            exp.TsOrDsToDatetime,
            exp.TsOrDsToTimestamp,
            exp.TsOrDsToTime,
            exp.TsOrDsToDate,
        )
        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"),
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArrayRemove: filter_array_using_unnest,
            exp.BitwiseAndAgg: rename_func("BIT_AND"),
            exp.BitwiseOrAgg: rename_func("BIT_OR"),
            exp.BitwiseXorAgg: rename_func("BIT_XOR"),
            exp.BitwiseCountAgg: rename_func("BIT_COUNT"),
            exp.ByteLength: rename_func("BYTE_LENGTH"),
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: lambda self, e: groupconcat_sql(
                self, e, func_name="STRING_AGG", within_group=False
            ),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.Int64: rename_func("INT64"),
            exp.JSONBool: rename_func("BOOL"),
            exp.JSONExtract: _json_extract_sql,
            exp.JSONExtractArray: _json_extract_sql,
            exp.JSONExtractScalar: _json_extract_sql,
            exp.JSONFormat: lambda self, e: self.func(
                "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING",
                e.this,
                e.args.get("options"),
            ),
            exp.JSONKeysAtDepth: rename_func("JSON_KEYS"),
            exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"),
            exp.Levenshtein: _levenshtein_sql,
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.Normalize: lambda self, e: self.func(
                "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE",
                e.this,
                e.args.get("form"),
            ),
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpExtractAll: lambda self, e: self.func(
                "REGEXP_EXTRACT_ALL", e.this, e.expression
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this),
            exp.ParseDatetime: lambda self, e: self.func(
                "PARSE_DATETIME", self.format_time(e), e.this
            ),
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_projection_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.String: rename_func("STRING"),
            exp.StrPosition: lambda self, e: (
                strposition_sql(
                    self, e, func_name="INSTR", supports_position=True, supports_occurrence=True
                )
            ),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToDatetime: rename_func("DATETIME"),
            exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Uuid: lambda *_: "GENERATE_UUID()",
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
            exp.SafeDivide: rename_func("SAFE_DIVIDE"),
        }
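        # Editor's sketch (not part of the upstream module): TRANSFORMS above renames
        # functions on the way out, e.g. (indicative):
        #
        #   import sqlglot
        #   sqlglot.transpile("SELECT COUNT_IF(x > 0)", write="bigquery")
        #   # -> ['SELECT COUNTIF(x > 0)']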
"STRING", 1442 exp.DataType.Type.NVARCHAR: "STRING", 1443 exp.DataType.Type.SMALLINT: "INT64", 1444 exp.DataType.Type.TEXT: "STRING", 1445 exp.DataType.Type.TIMESTAMP: "DATETIME", 1446 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1447 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1448 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1449 exp.DataType.Type.TINYINT: "INT64", 1450 exp.DataType.Type.ROWVERSION: "BYTES", 1451 exp.DataType.Type.UUID: "STRING", 1452 exp.DataType.Type.VARBINARY: "BYTES", 1453 exp.DataType.Type.VARCHAR: "STRING", 1454 exp.DataType.Type.VARIANT: "ANY TYPE", 1455 } 1456 1457 PROPERTIES_LOCATION = { 1458 **generator.Generator.PROPERTIES_LOCATION, 1459 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1460 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1461 } 1462 1463 # WINDOW comes after QUALIFY 1464 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1465 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1466 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1467 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1468 } 1469 1470 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1471 RESERVED_KEYWORDS = { 1472 "all", 1473 "and", 1474 "any", 1475 "array", 1476 "as", 1477 "asc", 1478 "assert_rows_modified", 1479 "at", 1480 "between", 1481 "by", 1482 "case", 1483 "cast", 1484 "collate", 1485 "contains", 1486 "create", 1487 "cross", 1488 "cube", 1489 "current", 1490 "default", 1491 "define", 1492 "desc", 1493 "distinct", 1494 "else", 1495 "end", 1496 "enum", 1497 "escape", 1498 "except", 1499 "exclude", 1500 "exists", 1501 "extract", 1502 "false", 1503 "fetch", 1504 "following", 1505 "for", 1506 "from", 1507 "full", 1508 "group", 1509 "grouping", 1510 "groups", 1511 "hash", 1512 "having", 1513 "if", 1514 "ignore", 1515 "in", 1516 "inner", 1517 "intersect", 1518 "interval", 1519 "into", 1520 "is", 1521 "join", 1522 "lateral", 1523 "left", 1524 "like", 1525 "limit", 1526 "lookup", 1527 "merge", 1528 "natural", 1529 "new", 1530 "no", 1531 "not", 1532 "null", 1533 "nulls", 1534 "of", 1535 "on", 1536 "or", 1537 "order", 1538 "outer", 1539 "over", 1540 "partition", 1541 "preceding", 1542 "proto", 1543 "qualify", 1544 "range", 1545 "recursive", 1546 "respect", 1547 "right", 1548 "rollup", 1549 "rows", 1550 "select", 1551 "set", 1552 "some", 1553 "struct", 1554 "tablesample", 1555 "then", 1556 "to", 1557 "treat", 1558 "true", 1559 "unbounded", 1560 "union", 1561 "unnest", 1562 "using", 1563 "when", 1564 "where", 1565 "window", 1566 "with", 1567 "within", 1568 } 1569 1570 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1571 unit = expression.unit 1572 unit_sql = unit.name if unit.is_string else self.sql(unit) 1573 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1574 1575 def mod_sql(self, expression: exp.Mod) -> str: 1576 this = expression.this 1577 expr = expression.expression 1578 return self.func( 1579 "MOD", 1580 this.unnest() if isinstance(this, exp.Paren) else this, 1581 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1582 ) 1583 1584 def column_parts(self, expression: exp.Column) -> str: 1585 if expression.meta.get("quoted_column"): 1586 # If a column reference is of the form `dataset.table`.name, we need 1587 # to preserve the quoted table path, otherwise the reference breaks 1588 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1589 table_path = 
self.sql(exp.Identifier(this=table_parts, quoted=True)) 1590 return f"{table_path}.{self.sql(expression, 'this')}" 1591 1592 return super().column_parts(expression) 1593 1594 def table_parts(self, expression: exp.Table) -> str: 1595 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1596 # we need to make sure the correct quoting is used in each case. 1597 # 1598 # For example, if there is a CTE x that clashes with a schema name, then the former will 1599 # return the table y in that schema, whereas the latter will return the CTE's y column: 1600 # 1601 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1602 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1603 if expression.meta.get("quoted_table"): 1604 table_parts = ".".join(p.name for p in expression.parts) 1605 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1606 1607 return super().table_parts(expression) 1608 1609 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1610 this = expression.this 1611 if isinstance(this, exp.TsOrDsToDatetime): 1612 func_name = "FORMAT_DATETIME" 1613 elif isinstance(this, exp.TsOrDsToTimestamp): 1614 func_name = "FORMAT_TIMESTAMP" 1615 elif isinstance(this, exp.TsOrDsToTime): 1616 func_name = "FORMAT_TIME" 1617 else: 1618 func_name = "FORMAT_DATE" 1619 1620 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1621 return self.func( 1622 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1623 ) 1624 1625 def eq_sql(self, expression: exp.EQ) -> str: 1626 # Operands of = cannot be NULL in BigQuery 1627 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1628 if not isinstance(expression.parent, exp.Update): 1629 return "NULL" 1630 1631 return self.binary(expression, "=") 1632 1633 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1634 parent = expression.parent 1635 1636 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1637 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1638 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1639 return self.func( 1640 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1641 ) 1642 1643 return super().attimezone_sql(expression) 1644 1645 def trycast_sql(self, expression: exp.TryCast) -> str: 1646 return self.cast_sql(expression, safe_prefix="SAFE_") 1647 1648 def bracket_sql(self, expression: exp.Bracket) -> str: 1649 this = expression.this 1650 expressions = expression.expressions 1651 1652 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1653 arg = expressions[0] 1654 if arg.type is None: 1655 from sqlglot.optimizer.annotate_types import annotate_types 1656 1657 arg = annotate_types(arg, dialect=self.dialect) 1658 1659 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1660 # BQ doesn't support bracket syntax with string values for structs 1661 return f"{self.sql(this)}.{arg.name}" 1662 1663 expressions_sql = self.expressions(expression, flat=True) 1664 offset = expression.args.get("offset") 1665 1666 if offset == 0: 1667 expressions_sql = f"OFFSET({expressions_sql})" 1668 elif offset == 1: 1669 expressions_sql = f"ORDINAL({expressions_sql})" 1670 elif offset is not None: 1671 self.unsupported(f"Unsupported array offset: {offset}") 1672 1673 if expression.args.get("safe"): 1674 expressions_sql = f"SAFE_{expressions_sql}" 1675 1676 return f"{self.sql(this)}[{expressions_sql}]" 1677 1678 def in_unnest_op(self, expression: exp.Unnest) -> str: 1679 return self.sql(expression) 1680 1681 def version_sql(self, expression: exp.Version) -> str: 1682 if expression.name == "TIMESTAMP": 1683 expression.set("this", "SYSTEM_TIME") 1684 return super().version_sql(expression) 1685 1686 def contains_sql(self, expression: exp.Contains) -> str: 1687 this = expression.this 1688 expr = expression.expression 1689 1690 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1691 this = this.this 1692 expr = expr.this 1693 1694 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1695 1696 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1697 this = expression.this 1698 1699 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1700 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1701 # because they aren't literals and so the above syntax is invalid BigQuery. 1702 if isinstance(this, exp.Array): 1703 elem = seq_get(this.expressions, 0) 1704 if not (elem and elem.find(exp.Query)): 1705 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1706 1707 return super().cast_sql(expression, safe_prefix=safe_prefix) 1708 1709 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1710 variables = self.expressions(expression, "this") 1711 default = self.sql(expression, "default") 1712 default = f" DEFAULT {default}" if default else "" 1713 kind = self.sql(expression, "kind") 1714 kind = f" {kind}" if kind else "" 1715 1716 return f"{variables}{kind}{default}"
440class BigQuery(Dialect): 441 WEEK_OFFSET = -1 442 UNNEST_COLUMN_ONLY = True 443 SUPPORTS_USER_DEFINED_TYPES = False 444 SUPPORTS_SEMI_ANTI_JOIN = False 445 LOG_BASE_FIRST = False 446 HEX_LOWERCASE = True 447 FORCE_EARLY_ALIAS_REF_EXPANSION = True 448 PRESERVE_ORIGINAL_NAMES = True 449 HEX_STRING_IS_INTEGER_TYPE = True 450 451 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 452 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 453 454 # bigquery udfs are case sensitive 455 NORMALIZE_FUNCTIONS = False 456 457 # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time 458 TIME_MAPPING = { 459 "%D": "%m/%d/%y", 460 "%E6S": "%S.%f", 461 "%e": "%-d", 462 } 463 464 FORMAT_MAPPING = { 465 "DD": "%d", 466 "MM": "%m", 467 "MON": "%b", 468 "MONTH": "%B", 469 "YYYY": "%Y", 470 "YY": "%y", 471 "HH": "%I", 472 "HH12": "%I", 473 "HH24": "%H", 474 "MI": "%M", 475 "SS": "%S", 476 "SSSSS": "%f", 477 "TZH": "%z", 478 } 479 480 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 481 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 482 # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix 483 # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column 484 PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE", "_TABLE_SUFFIX", "_FILE_NAME"} 485 486 # All set operations require either a DISTINCT or ALL specifier 487 SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None) 488 489 # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont 490 COERCES_TO = { 491 **TypeAnnotator.COERCES_TO, 492 exp.DataType.Type.BIGDECIMAL: {exp.DataType.Type.DOUBLE}, 493 } 494 COERCES_TO[exp.DataType.Type.DECIMAL] |= {exp.DataType.Type.BIGDECIMAL} 495 COERCES_TO[exp.DataType.Type.BIGINT] |= {exp.DataType.Type.BIGDECIMAL} 496 497 # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types 498 TYPE_TO_EXPRESSIONS = { 499 **Dialect.TYPE_TO_EXPRESSIONS, 500 exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP], 501 } 502 TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP) 503 504 ANNOTATORS = { 505 **Dialect.ANNOTATORS, 506 **{ 507 expr_type: annotate_with_type_lambda(data_type) 508 for data_type, expressions in TYPE_TO_EXPRESSIONS.items() 509 for expr_type in expressions 510 }, 511 **{ 512 expr_type: lambda self, e: _annotate_math_functions(self, e) 513 for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round) 514 }, 515 **{ 516 expr_type: lambda self, e: self._annotate_by_args(e, "this") 517 for expr_type in ( 518 exp.Abs, 519 exp.ArgMax, 520 exp.ArgMin, 521 exp.DateTrunc, 522 exp.DatetimeTrunc, 523 exp.FirstValue, 524 exp.GroupConcat, 525 exp.IgnoreNulls, 526 exp.JSONExtract, 527 exp.Lead, 528 exp.Left, 529 exp.Lower, 530 exp.NthValue, 531 exp.Pad, 532 exp.PercentileDisc, 533 exp.RegexpExtract, 534 exp.RegexpReplace, 535 exp.Repeat, 536 exp.Replace, 537 exp.RespectNulls, 538 exp.Reverse, 539 exp.Right, 540 exp.SafeNegate, 541 exp.Sign, 542 exp.Substring, 543 exp.TimestampTrunc, 544 exp.Translate, 545 exp.Trim, 546 exp.Upper, 547 ) 548 }, 549 exp.Acos: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 550 exp.Acosh: lambda self, e: 
self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 551 exp.Asin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 552 exp.Asinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 553 exp.Atan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 554 exp.Atanh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 555 exp.Atan2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 556 exp.ApproxTopSum: lambda self, e: _annotate_by_args_approx_top(self, e), 557 exp.ApproxTopK: lambda self, e: _annotate_by_args_approx_top(self, e), 558 exp.ApproxQuantiles: lambda self, e: self._annotate_by_args(e, "this", array=True), 559 exp.Array: _annotate_array, 560 exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"), 561 exp.Ascii: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 562 exp.BitwiseAndAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 563 exp.BitwiseOrAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 564 exp.BitwiseXorAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 565 exp.BitwiseCountAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 566 exp.ByteLength: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 567 exp.ByteString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 568 exp.Cbrt: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 569 exp.CodePointsToBytes: lambda self, e: self._annotate_with_type( 570 e, exp.DataType.Type.BINARY 571 ), 572 exp.CodePointsToString: lambda self, e: self._annotate_with_type( 573 e, exp.DataType.Type.VARCHAR 574 ), 575 exp.Concat: _annotate_concat, 576 exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 577 exp.Cot: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 578 exp.CosineDistance: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 579 exp.Coth: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 580 exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 581 exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 582 exp.Csc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 583 exp.Csch: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 584 exp.CumeDist: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 585 exp.DateFromUnixDate: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATE), 586 exp.DenseRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 587 exp.EuclideanDistance: lambda self, e: self._annotate_with_type( 588 e, exp.DataType.Type.DOUBLE 589 ), 590 exp.FarmFingerprint: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 591 exp.Unhex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 592 exp.Float64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 593 exp.Format: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 594 exp.GenerateTimestampArray: lambda self, e: self._annotate_with_type( 595 e, exp.DataType.build("ARRAY<TIMESTAMP>", dialect="bigquery") 596 ), 597 exp.Grouping: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 598 exp.IsInf: lambda self, e: 
self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 599 exp.IsNan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 600 exp.JSONArray: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 601 exp.JSONArrayAppend: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 602 exp.JSONArrayInsert: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 603 exp.JSONBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 604 exp.JSONExtractScalar: lambda self, e: self._annotate_with_type( 605 e, exp.DataType.Type.VARCHAR 606 ), 607 exp.JSONExtractArray: lambda self, e: self._annotate_by_args(e, "this", array=True), 608 exp.JSONFormat: lambda self, e: self._annotate_with_type( 609 e, exp.DataType.Type.JSON if e.args.get("to_json") else exp.DataType.Type.VARCHAR 610 ), 611 exp.JSONKeysAtDepth: lambda self, e: self._annotate_with_type( 612 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 613 ), 614 exp.JSONObject: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 615 exp.JSONRemove: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 616 exp.JSONSet: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 617 exp.JSONStripNulls: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 618 exp.JSONType: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 619 exp.JSONValueArray: lambda self, e: self._annotate_with_type( 620 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 621 ), 622 exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"), 623 exp.LowerHex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 624 exp.LaxBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 625 exp.LaxFloat64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 626 exp.LaxInt64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 627 exp.LaxString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 628 exp.MD5Digest: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 629 exp.Normalize: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 630 exp.Ntile: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 631 exp.ParseTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 632 exp.ParseDatetime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATETIME), 633 exp.ParseBignumeric: lambda self, e: self._annotate_with_type( 634 e, exp.DataType.Type.BIGDECIMAL 635 ), 636 exp.ParseNumeric: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DECIMAL), 637 exp.PercentileCont: lambda self, e: _annotate_by_args_with_coerce(self, e), 638 exp.PercentRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 639 exp.Rank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 640 exp.RangeBucket: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 641 exp.RegexpExtractAll: lambda self, e: self._annotate_by_args(e, "this", array=True), 642 exp.RegexpInstr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 643 exp.RowNumber: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 644 exp.Rand: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 645 exp.SafeConvertBytesToString: lambda self, e: self._annotate_with_type( 646 e, 
exp.DataType.Type.VARCHAR 647 ), 648 exp.SafeAdd: lambda self, e: _annotate_by_args_with_coerce(self, e), 649 exp.SafeMultiply: lambda self, e: _annotate_by_args_with_coerce(self, e), 650 exp.SafeSubtract: lambda self, e: _annotate_by_args_with_coerce(self, e), 651 exp.Sec: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 652 exp.Sech: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 653 exp.Soundex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 654 exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 655 exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 656 exp.Sin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 657 exp.Sinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 658 exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True), 659 exp.TimestampFromParts: lambda self, e: self._annotate_with_type( 660 e, exp.DataType.Type.DATETIME 661 ), 662 exp.TimeFromParts: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 663 exp.TimeTrunc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 664 exp.ToCodePoints: lambda self, e: self._annotate_with_type( 665 e, exp.DataType.build("ARRAY<BIGINT>", dialect="bigquery") 666 ), 667 exp.TsOrDsToTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 668 exp.Unicode: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 669 exp.Uuid: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 670 } 671 672 def normalize_identifier(self, expression: E) -> E: 673 if ( 674 isinstance(expression, exp.Identifier) 675 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 676 ): 677 parent = expression.parent 678 while isinstance(parent, exp.Dot): 679 parent = parent.parent 680 681 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 682 # by default. The following check uses a heuristic to detect tables based on whether 683 # they are qualified. This should generally be correct, because tables in BigQuery 684 # must be qualified with at least a dataset, unless @@dataset_id is set. 
685 case_sensitive = ( 686 isinstance(parent, exp.UserDefinedFunction) 687 or ( 688 isinstance(parent, exp.Table) 689 and parent.db 690 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 691 ) 692 or expression.meta.get("is_table") 693 ) 694 if not case_sensitive: 695 expression.set("this", expression.this.lower()) 696 697 return t.cast(E, expression) 698 699 return super().normalize_identifier(expression) 700 701 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 702 VAR_TOKENS = { 703 TokenType.DASH, 704 TokenType.VAR, 705 } 706 707 class Tokenizer(tokens.Tokenizer): 708 QUOTES = ["'", '"', '"""', "'''"] 709 COMMENTS = ["--", "#", ("/*", "*/")] 710 IDENTIFIERS = ["`"] 711 STRING_ESCAPES = ["\\"] 712 713 HEX_STRINGS = [("0x", ""), ("0X", "")] 714 715 BYTE_STRINGS = [ 716 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 717 ] 718 719 RAW_STRINGS = [ 720 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 721 ] 722 723 NESTED_COMMENTS = False 724 725 KEYWORDS = { 726 **tokens.Tokenizer.KEYWORDS, 727 "ANY TYPE": TokenType.VARIANT, 728 "BEGIN": TokenType.COMMAND, 729 "BEGIN TRANSACTION": TokenType.BEGIN, 730 "BYTEINT": TokenType.INT, 731 "BYTES": TokenType.BINARY, 732 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 733 "DATETIME": TokenType.TIMESTAMP, 734 "DECLARE": TokenType.DECLARE, 735 "ELSEIF": TokenType.COMMAND, 736 "EXCEPTION": TokenType.COMMAND, 737 "EXPORT": TokenType.EXPORT, 738 "FLOAT64": TokenType.DOUBLE, 739 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 740 "LOOP": TokenType.COMMAND, 741 "MODEL": TokenType.MODEL, 742 "NOT DETERMINISTIC": TokenType.VOLATILE, 743 "RECORD": TokenType.STRUCT, 744 "REPEAT": TokenType.COMMAND, 745 "TIMESTAMP": TokenType.TIMESTAMPTZ, 746 "WHILE": TokenType.COMMAND, 747 } 748 KEYWORDS.pop("DIV") 749 KEYWORDS.pop("VALUES") 750 KEYWORDS.pop("/*+") 751 752 class Parser(parser.Parser): 753 PREFIXED_PIVOT_COLUMNS = True 754 LOG_DEFAULTS_TO_LN = True 755 SUPPORTS_IMPLICIT_UNNEST = True 756 JOINS_HAVE_EQUAL_PRECEDENCE = True 757 758 # BigQuery does not allow ASC/DESC to be used as an identifier, allows GRANT as an identifier 759 ID_VAR_TOKENS = { 760 *parser.Parser.ID_VAR_TOKENS, 761 TokenType.GRANT, 762 } - {TokenType.ASC, TokenType.DESC} 763 764 ALIAS_TOKENS = { 765 *parser.Parser.ALIAS_TOKENS, 766 TokenType.GRANT, 767 } - {TokenType.ASC, TokenType.DESC} 768 769 TABLE_ALIAS_TOKENS = { 770 *parser.Parser.TABLE_ALIAS_TOKENS, 771 TokenType.GRANT, 772 } - {TokenType.ASC, TokenType.DESC} 773 774 COMMENT_TABLE_ALIAS_TOKENS = { 775 *parser.Parser.COMMENT_TABLE_ALIAS_TOKENS, 776 TokenType.GRANT, 777 } - {TokenType.ASC, TokenType.DESC} 778 779 UPDATE_ALIAS_TOKENS = { 780 *parser.Parser.UPDATE_ALIAS_TOKENS, 781 TokenType.GRANT, 782 } - {TokenType.ASC, TokenType.DESC} 783 784 FUNCTIONS = { 785 **parser.Parser.FUNCTIONS, 786 "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list, 787 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 788 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 789 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 790 "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list, 791 "BOOL": exp.JSONBool.from_arg_list, 792 "CONTAINS_SUBSTR": _build_contains_substring, 793 "DATE": _build_date, 794 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 795 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 796 "DATE_TRUNC": lambda args: exp.DateTrunc( 797 unit=seq_get(args, 1), 798 this=seq_get(args, 0), 799 zone=seq_get(args, 2), 800 ), 801 "DATETIME": _build_datetime, 802 "DATETIME_ADD": 
build_date_delta_with_interval(exp.DatetimeAdd), 803 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 804 "DIV": binary_from_function(exp.IntDiv), 805 "EDIT_DISTANCE": _build_levenshtein, 806 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 807 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 808 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 809 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 810 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 811 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 812 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 813 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 814 "JSON_STRIP_NULLS": _build_json_strip_nulls, 815 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 816 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 817 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 818 "MD5": exp.MD5Digest.from_arg_list, 819 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 820 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 821 ), 822 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 823 "TO_HEX": _build_to_hex, 824 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 825 [seq_get(args, 1), seq_get(args, 0)] 826 ), 827 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 828 [seq_get(args, 1), seq_get(args, 0)] 829 ), 830 "PARSE_TIMESTAMP": _build_parse_timestamp, 831 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 832 [seq_get(args, 1), seq_get(args, 0)] 833 ), 834 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 835 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 836 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 837 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 838 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 839 ), 840 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 841 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 842 "SPLIT": lambda args: exp.Split( 843 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 844 this=seq_get(args, 0), 845 expression=seq_get(args, 1) or exp.Literal.string(","), 846 ), 847 "STRPOS": exp.StrPosition.from_arg_list, 848 "TIME": _build_time, 849 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 850 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 851 "TIMESTAMP": _build_timestamp, 852 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 853 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 854 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 855 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 856 ), 857 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 858 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 859 ), 860 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 861 "TO_JSON": lambda args: exp.JSONFormat( 862 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 863 ), 864 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 865 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 866 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 867 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 868 "FROM_HEX": exp.Unhex.from_arg_list, 869 
"WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 870 } 871 872 FUNCTION_PARSERS = { 873 **parser.Parser.FUNCTION_PARSERS, 874 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 875 "JSON_ARRAY": lambda self: self.expression( 876 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 877 ), 878 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 879 "PREDICT": lambda self: self._parse_ml(exp.Predict), 880 "TRANSLATE": lambda self: self._parse_translate(), 881 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 882 "GENERATE_EMBEDDING": lambda self: self._parse_ml(exp.GenerateEmbedding), 883 "GENERATE_TEXT_EMBEDDING": lambda self: self._parse_ml( 884 exp.GenerateEmbedding, is_text=True 885 ), 886 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 887 "FORECAST": lambda self: self._parse_ml(exp.MLForecast), 888 } 889 FUNCTION_PARSERS.pop("TRIM") 890 891 NO_PAREN_FUNCTIONS = { 892 **parser.Parser.NO_PAREN_FUNCTIONS, 893 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 894 } 895 896 NESTED_TYPE_TOKENS = { 897 *parser.Parser.NESTED_TYPE_TOKENS, 898 TokenType.TABLE, 899 } 900 901 PROPERTY_PARSERS = { 902 **parser.Parser.PROPERTY_PARSERS, 903 "NOT DETERMINISTIC": lambda self: self.expression( 904 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 905 ), 906 "OPTIONS": lambda self: self._parse_with_property(), 907 } 908 909 CONSTRAINT_PARSERS = { 910 **parser.Parser.CONSTRAINT_PARSERS, 911 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 912 } 913 914 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 915 RANGE_PARSERS.pop(TokenType.OVERLAPS) 916 917 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 918 919 STATEMENT_PARSERS = { 920 **parser.Parser.STATEMENT_PARSERS, 921 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 922 TokenType.END: lambda self: self._parse_as_command(self._prev), 923 TokenType.FOR: lambda self: self._parse_for_in(), 924 TokenType.EXPORT: lambda self: self._parse_export_data(), 925 TokenType.DECLARE: lambda self: self._parse_declare(), 926 } 927 928 BRACKET_OFFSETS = { 929 "OFFSET": (0, False), 930 "ORDINAL": (1, False), 931 "SAFE_OFFSET": (0, True), 932 "SAFE_ORDINAL": (1, True), 933 } 934 935 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 936 index = self._index 937 this = self._parse_range() 938 self._match_text_seq("DO") 939 if self._match(TokenType.COMMAND): 940 self._retreat(index) 941 return self._parse_as_command(self._prev) 942 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 943 944 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 945 this = super()._parse_table_part(schema=schema) or self._parse_number() 946 947 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 948 if isinstance(this, exp.Identifier): 949 table_name = this.name 950 while self._match(TokenType.DASH, advance=False) and self._next: 951 start = self._curr 952 while self._is_connected() and not self._match_set( 953 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 954 ): 955 self._advance() 956 957 if start == self._curr: 958 break 959 960 table_name += self._find_sql(start, self._prev) 961 962 this = exp.Identifier( 963 this=table_name, quoted=this.args.get("quoted") 964 ).update_positions(this) 965 elif isinstance(this, exp.Literal): 966 table_name = this.name 967 968 if self._is_connected() and 
self._parse_var(any_token=True): 969 table_name += self._prev.text 970 971 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 972 973 return this 974 975 def _parse_table_parts( 976 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 977 ) -> exp.Table: 978 table = super()._parse_table_parts( 979 schema=schema, is_db_reference=is_db_reference, wildcard=True 980 ) 981 982 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 983 if not table.catalog: 984 if table.db: 985 previous_db = table.args["db"] 986 parts = table.db.split(".") 987 if len(parts) == 2 and not table.args["db"].quoted: 988 table.set( 989 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 990 ) 991 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 992 else: 993 previous_this = table.this 994 parts = table.name.split(".") 995 if len(parts) == 2 and not table.this.quoted: 996 table.set( 997 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 998 ) 999 table.set( 1000 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 1001 ) 1002 1003 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 1004 alias = table.this 1005 catalog, db, this, *rest = ( 1006 exp.to_identifier(p, quoted=True) 1007 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 1008 ) 1009 1010 for part in (catalog, db, this): 1011 if part: 1012 part.update_positions(table.this) 1013 1014 if rest and this: 1015 this = exp.Dot.build([this, *rest]) # type: ignore 1016 1017 table = exp.Table( 1018 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 1019 ) 1020 table.meta["quoted_table"] = True 1021 else: 1022 alias = None 1023 1024 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 1025 # dataset, so if the project identifier is omitted we need to fix the ast so that 1026 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 1027 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 1028 # views, because it would seem like the "catalog" part is set, when it'd actually 1029 # be the region/dataset. Merging the two identifiers into a single one is done to 1030 # avoid producing a 4-part Table reference, which would cause issues in the schema 1031 # module, when there are 3-part table names mixed with information schema views. 1032 # 1033 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 1034 table_parts = table.parts 1035 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 1036 # We need to alias the table here to avoid breaking existing qualified columns. 1037 # This is expected to be safe, because if there's an actual alias coming up in 1038 # the token stream, it will overwrite this one. If there isn't one, we are only 1039 # exposing the name that can be used to reference the view explicitly (a no-op). 
1040 exp.alias_( 1041 table, 1042 t.cast(exp.Identifier, alias or table_parts[-1]), 1043 table=True, 1044 copy=False, 1045 ) 1046 1047 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 1048 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 1049 line=table_parts[-2].meta.get("line"), 1050 col=table_parts[-1].meta.get("col"), 1051 start=table_parts[-2].meta.get("start"), 1052 end=table_parts[-1].meta.get("end"), 1053 ) 1054 table.set("this", new_this) 1055 table.set("db", seq_get(table_parts, -3)) 1056 table.set("catalog", seq_get(table_parts, -4)) 1057 1058 return table 1059 1060 def _parse_column(self) -> t.Optional[exp.Expression]: 1061 column = super()._parse_column() 1062 if isinstance(column, exp.Column): 1063 parts = column.parts 1064 if any("." in p.name for p in parts): 1065 catalog, db, table, this, *rest = ( 1066 exp.to_identifier(p, quoted=True) 1067 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 1068 ) 1069 1070 if rest and this: 1071 this = exp.Dot.build([this, *rest]) # type: ignore 1072 1073 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 1074 column.meta["quoted_column"] = True 1075 1076 return column 1077 1078 @t.overload 1079 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 1080 1081 @t.overload 1082 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 1083 1084 def _parse_json_object(self, agg=False): 1085 json_object = super()._parse_json_object() 1086 array_kv_pair = seq_get(json_object.expressions, 0) 1087 1088 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 1089 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 1090 if ( 1091 array_kv_pair 1092 and isinstance(array_kv_pair.this, exp.Array) 1093 and isinstance(array_kv_pair.expression, exp.Array) 1094 ): 1095 keys = array_kv_pair.this.expressions 1096 values = array_kv_pair.expression.expressions 1097 1098 json_object.set( 1099 "expressions", 1100 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 1101 ) 1102 1103 return json_object 1104 1105 def _parse_bracket( 1106 self, this: t.Optional[exp.Expression] = None 1107 ) -> t.Optional[exp.Expression]: 1108 bracket = super()._parse_bracket(this) 1109 1110 if this is bracket: 1111 return bracket 1112 1113 if isinstance(bracket, exp.Bracket): 1114 for expression in bracket.expressions: 1115 name = expression.name.upper() 1116 1117 if name not in self.BRACKET_OFFSETS: 1118 break 1119 1120 offset, safe = self.BRACKET_OFFSETS[name] 1121 bracket.set("offset", offset) 1122 bracket.set("safe", safe) 1123 expression.replace(expression.expressions[0]) 1124 1125 return bracket 1126 1127 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 1128 unnest = super()._parse_unnest(with_alias=with_alias) 1129 1130 if not unnest: 1131 return None 1132 1133 unnest_expr = seq_get(unnest.expressions, 0) 1134 if unnest_expr: 1135 from sqlglot.optimizer.annotate_types import annotate_types 1136 1137 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 1138 1139 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 1140 # in contrast to other dialects such as DuckDB which flattens only the array by default 1141 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 1142 array_elem.is_type(exp.DataType.Type.STRUCT) 1143 for array_elem in unnest_expr._type.expressions 1144 ): 1145 
unnest.set("explode_array", True) 1146 1147 return unnest 1148 1149 def _parse_make_interval(self) -> exp.MakeInterval: 1150 expr = exp.MakeInterval() 1151 1152 for arg_key in expr.arg_types: 1153 value = self._parse_lambda() 1154 1155 if not value: 1156 break 1157 1158 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1159 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1160 if isinstance(value, exp.Kwarg): 1161 arg_key = value.this.name 1162 1163 expr.set(arg_key, value) 1164 1165 self._match(TokenType.COMMA) 1166 1167 return expr 1168 1169 def _parse_ml(self, expr_type: t.Type[E], **kwargs) -> E: 1170 self._match_text_seq("MODEL") 1171 this = self._parse_table() 1172 1173 self._match(TokenType.COMMA) 1174 self._match_text_seq("TABLE") 1175 1176 # Certain functions like ML.FORECAST require a STRUCT argument but not a TABLE/SELECT one 1177 expression = ( 1178 self._parse_table() if not self._match(TokenType.STRUCT, advance=False) else None 1179 ) 1180 1181 self._match(TokenType.COMMA) 1182 1183 return self.expression( 1184 expr_type, 1185 this=this, 1186 expression=expression, 1187 params_struct=self._parse_bitwise(), 1188 **kwargs, 1189 ) 1190 1191 def _parse_translate(self) -> exp.Translate | exp.MLTranslate: 1192 # Check if this is ML.TRANSLATE by looking at previous tokens 1193 token = seq_get(self._tokens, self._index - 4) 1194 if token and token.text.upper() == "ML": 1195 return self._parse_ml(exp.MLTranslate) 1196 1197 return exp.Translate.from_arg_list(self._parse_function_args()) 1198 1199 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 1200 self._match(TokenType.TABLE) 1201 this = self._parse_table() 1202 1203 expr = self.expression(exp.FeaturesAtTime, this=this) 1204 1205 while self._match(TokenType.COMMA): 1206 arg = self._parse_lambda() 1207 1208 # Get the LHS of the Kwarg and set the arg to that value, e.g 1209 # "num_rows => 1" sets the expr's `num_rows` arg 1210 if arg: 1211 expr.set(arg.this.name, arg) 1212 1213 return expr 1214 1215 def _parse_vector_search(self) -> exp.VectorSearch: 1216 self._match(TokenType.TABLE) 1217 base_table = self._parse_table() 1218 1219 self._match(TokenType.COMMA) 1220 1221 column_to_search = self._parse_bitwise() 1222 self._match(TokenType.COMMA) 1223 1224 self._match(TokenType.TABLE) 1225 query_table = self._parse_table() 1226 1227 expr = self.expression( 1228 exp.VectorSearch, 1229 this=base_table, 1230 column_to_search=column_to_search, 1231 query_table=query_table, 1232 ) 1233 1234 while self._match(TokenType.COMMA): 1235 # query_column_to_search can be named argument or positional 1236 if self._match(TokenType.STRING, advance=False): 1237 query_column = self._parse_string() 1238 expr.set("query_column_to_search", query_column) 1239 else: 1240 arg = self._parse_lambda() 1241 if arg: 1242 expr.set(arg.this.name, arg) 1243 1244 return expr 1245 1246 def _parse_export_data(self) -> exp.Export: 1247 self._match_text_seq("DATA") 1248 1249 return self.expression( 1250 exp.Export, 1251 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1252 options=self._parse_properties(), 1253 this=self._match_text_seq("AS") and self._parse_select(), 1254 ) 1255 1256 class Generator(generator.Generator): 1257 INTERVAL_ALLOWS_PLURAL_FORM = False 1258 JOIN_HINTS = False 1259 QUERY_HINTS = False 1260 TABLE_HINTS = False 1261 LIMIT_FETCH = "LIMIT" 1262 RENAME_TABLE_WITH_DB = False 1263 NVL2_SUPPORTED = False 1264 UNNEST_WITH_ORDINALITY = False 1265 
COLLATE_IS_FUNC = True 1266 LIMIT_ONLY_LITERALS = True 1267 SUPPORTS_TABLE_ALIAS_COLUMNS = False 1268 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 1269 JSON_KEY_VALUE_PAIR_SEP = "," 1270 NULL_ORDERING_SUPPORTED = False 1271 IGNORE_NULLS_IN_FUNC = True 1272 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 1273 CAN_IMPLEMENT_ARRAY_ANY = True 1274 SUPPORTS_TO_NUMBER = False 1275 NAMED_PLACEHOLDER_TOKEN = "@" 1276 HEX_FUNC = "TO_HEX" 1277 WITH_PROPERTIES_PREFIX = "OPTIONS" 1278 SUPPORTS_EXPLODING_PROJECTIONS = False 1279 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 1280 SUPPORTS_UNIX_SECONDS = True 1281 1282 SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$") 1283 1284 TS_OR_DS_TYPES = ( 1285 exp.TsOrDsToDatetime, 1286 exp.TsOrDsToTimestamp, 1287 exp.TsOrDsToTime, 1288 exp.TsOrDsToDate, 1289 ) 1290 1291 TRANSFORMS = { 1292 **generator.Generator.TRANSFORMS, 1293 exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"), 1294 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 1295 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 1296 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 1297 exp.Array: inline_array_unless_query, 1298 exp.ArrayContains: _array_contains_sql, 1299 exp.ArrayFilter: filter_array_using_unnest, 1300 exp.ArrayRemove: filter_array_using_unnest, 1301 exp.BitwiseAndAgg: rename_func("BIT_AND"), 1302 exp.BitwiseOrAgg: rename_func("BIT_OR"), 1303 exp.BitwiseXorAgg: rename_func("BIT_XOR"), 1304 exp.BitwiseCountAgg: rename_func("BIT_COUNT"), 1305 exp.ByteLength: rename_func("BYTE_LENGTH"), 1306 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 1307 exp.CollateProperty: lambda self, e: ( 1308 f"DEFAULT COLLATE {self.sql(e, 'this')}" 1309 if e.args.get("default") 1310 else f"COLLATE {self.sql(e, 'this')}" 1311 ), 1312 exp.Commit: lambda *_: "COMMIT TRANSACTION", 1313 exp.CountIf: rename_func("COUNTIF"), 1314 exp.Create: _create_sql, 1315 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 1316 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 1317 exp.DateDiff: lambda self, e: self.func( 1318 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 1319 ), 1320 exp.DateFromParts: rename_func("DATE"), 1321 exp.DateStrToDate: datestrtodate_sql, 1322 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 1323 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 1324 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 1325 exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"), 1326 exp.FromTimeZone: lambda self, e: self.func( 1327 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 1328 ), 1329 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 1330 exp.GroupConcat: lambda self, e: groupconcat_sql( 1331 self, e, func_name="STRING_AGG", within_group=False 1332 ), 1333 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1334 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1335 exp.If: if_sql(false_value="NULL"), 1336 exp.ILike: no_ilike_sql, 1337 exp.IntDiv: rename_func("DIV"), 1338 exp.Int64: rename_func("INT64"), 1339 exp.JSONBool: rename_func("BOOL"), 1340 exp.JSONExtract: _json_extract_sql, 1341 exp.JSONExtractArray: _json_extract_sql, 1342 exp.JSONExtractScalar: _json_extract_sql, 1343 exp.JSONFormat: lambda self, e: self.func( 1344 "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING", 1345 e.this, 1346 e.args.get("options"), 1347 ), 1348 exp.JSONKeysAtDepth: rename_func("JSON_KEYS"), 1349 exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"), 1350 
exp.Levenshtein: _levenshtein_sql, 1351 exp.Max: max_or_greatest, 1352 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1353 exp.MD5Digest: rename_func("MD5"), 1354 exp.Min: min_or_least, 1355 exp.Normalize: lambda self, e: self.func( 1356 "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE", 1357 e.this, 1358 e.args.get("form"), 1359 ), 1360 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1361 exp.RegexpExtract: lambda self, e: self.func( 1362 "REGEXP_EXTRACT", 1363 e.this, 1364 e.expression, 1365 e.args.get("position"), 1366 e.args.get("occurrence"), 1367 ), 1368 exp.RegexpExtractAll: lambda self, e: self.func( 1369 "REGEXP_EXTRACT_ALL", e.this, e.expression 1370 ), 1371 exp.RegexpReplace: regexp_replace_sql, 1372 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1373 exp.ReturnsProperty: _returnsproperty_sql, 1374 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1375 exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this), 1376 exp.ParseDatetime: lambda self, e: self.func( 1377 "PARSE_DATETIME", self.format_time(e), e.this 1378 ), 1379 exp.Select: transforms.preprocess( 1380 [ 1381 transforms.explode_projection_to_unnest(), 1382 transforms.unqualify_unnest, 1383 transforms.eliminate_distinct_on, 1384 _alias_ordered_group, 1385 transforms.eliminate_semi_and_anti_joins, 1386 ] 1387 ), 1388 exp.SHA: rename_func("SHA1"), 1389 exp.SHA2: sha256_sql, 1390 exp.StabilityProperty: lambda self, e: ( 1391 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1392 ), 1393 exp.String: rename_func("STRING"), 1394 exp.StrPosition: lambda self, e: ( 1395 strposition_sql( 1396 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1397 ) 1398 ), 1399 exp.StrToDate: _str_to_datetime_sql, 1400 exp.StrToTime: _str_to_datetime_sql, 1401 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1402 exp.TimeFromParts: rename_func("TIME"), 1403 exp.TimestampFromParts: rename_func("DATETIME"), 1404 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1405 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1406 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1407 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1408 exp.TimeStrToTime: timestrtotime_sql, 1409 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1410 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1411 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1412 exp.TsOrDsToTime: rename_func("TIME"), 1413 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1414 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1415 exp.Unhex: rename_func("FROM_HEX"), 1416 exp.UnixDate: rename_func("UNIX_DATE"), 1417 exp.UnixToTime: _unix_to_time_sql, 1418 exp.Uuid: lambda *_: "GENERATE_UUID()", 1419 exp.Values: _derived_table_values_to_unnest, 1420 exp.VariancePop: rename_func("VAR_POP"), 1421 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1422 } 1423 1424 SUPPORTED_JSON_PATH_PARTS = { 1425 exp.JSONPathKey, 1426 exp.JSONPathRoot, 1427 exp.JSONPathSubscript, 1428 } 1429 1430 TYPE_MAPPING = { 1431 **generator.Generator.TYPE_MAPPING, 1432 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1433 exp.DataType.Type.BIGINT: "INT64", 1434 exp.DataType.Type.BINARY: "BYTES", 1435 exp.DataType.Type.BLOB: "BYTES", 1436 exp.DataType.Type.BOOLEAN: "BOOL", 1437 exp.DataType.Type.CHAR: "STRING", 1438 exp.DataType.Type.DECIMAL: "NUMERIC", 1439 exp.DataType.Type.DOUBLE: "FLOAT64", 1440 exp.DataType.Type.FLOAT: "FLOAT64", 1441 exp.DataType.Type.INT: "INT64", 1442 exp.DataType.Type.NCHAR: 
"STRING", 1443 exp.DataType.Type.NVARCHAR: "STRING", 1444 exp.DataType.Type.SMALLINT: "INT64", 1445 exp.DataType.Type.TEXT: "STRING", 1446 exp.DataType.Type.TIMESTAMP: "DATETIME", 1447 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1448 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1449 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1450 exp.DataType.Type.TINYINT: "INT64", 1451 exp.DataType.Type.ROWVERSION: "BYTES", 1452 exp.DataType.Type.UUID: "STRING", 1453 exp.DataType.Type.VARBINARY: "BYTES", 1454 exp.DataType.Type.VARCHAR: "STRING", 1455 exp.DataType.Type.VARIANT: "ANY TYPE", 1456 } 1457 1458 PROPERTIES_LOCATION = { 1459 **generator.Generator.PROPERTIES_LOCATION, 1460 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1461 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1462 } 1463 1464 # WINDOW comes after QUALIFY 1465 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1466 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1467 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1468 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1469 } 1470 1471 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1472 RESERVED_KEYWORDS = { 1473 "all", 1474 "and", 1475 "any", 1476 "array", 1477 "as", 1478 "asc", 1479 "assert_rows_modified", 1480 "at", 1481 "between", 1482 "by", 1483 "case", 1484 "cast", 1485 "collate", 1486 "contains", 1487 "create", 1488 "cross", 1489 "cube", 1490 "current", 1491 "default", 1492 "define", 1493 "desc", 1494 "distinct", 1495 "else", 1496 "end", 1497 "enum", 1498 "escape", 1499 "except", 1500 "exclude", 1501 "exists", 1502 "extract", 1503 "false", 1504 "fetch", 1505 "following", 1506 "for", 1507 "from", 1508 "full", 1509 "group", 1510 "grouping", 1511 "groups", 1512 "hash", 1513 "having", 1514 "if", 1515 "ignore", 1516 "in", 1517 "inner", 1518 "intersect", 1519 "interval", 1520 "into", 1521 "is", 1522 "join", 1523 "lateral", 1524 "left", 1525 "like", 1526 "limit", 1527 "lookup", 1528 "merge", 1529 "natural", 1530 "new", 1531 "no", 1532 "not", 1533 "null", 1534 "nulls", 1535 "of", 1536 "on", 1537 "or", 1538 "order", 1539 "outer", 1540 "over", 1541 "partition", 1542 "preceding", 1543 "proto", 1544 "qualify", 1545 "range", 1546 "recursive", 1547 "respect", 1548 "right", 1549 "rollup", 1550 "rows", 1551 "select", 1552 "set", 1553 "some", 1554 "struct", 1555 "tablesample", 1556 "then", 1557 "to", 1558 "treat", 1559 "true", 1560 "unbounded", 1561 "union", 1562 "unnest", 1563 "using", 1564 "when", 1565 "where", 1566 "window", 1567 "with", 1568 "within", 1569 } 1570 1571 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1572 unit = expression.unit 1573 unit_sql = unit.name if unit.is_string else self.sql(unit) 1574 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1575 1576 def mod_sql(self, expression: exp.Mod) -> str: 1577 this = expression.this 1578 expr = expression.expression 1579 return self.func( 1580 "MOD", 1581 this.unnest() if isinstance(this, exp.Paren) else this, 1582 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1583 ) 1584 1585 def column_parts(self, expression: exp.Column) -> str: 1586 if expression.meta.get("quoted_column"): 1587 # If a column reference is of the form `dataset.table`.name, we need 1588 # to preserve the quoted table path, otherwise the reference breaks 1589 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1590 table_path = 
self.sql(exp.Identifier(this=table_parts, quoted=True)) 1591 return f"{table_path}.{self.sql(expression, 'this')}" 1592 1593 return super().column_parts(expression) 1594 1595 def table_parts(self, expression: exp.Table) -> str: 1596 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1597 # we need to make sure the correct quoting is used in each case. 1598 # 1599 # For example, if there is a CTE x that clashes with a schema name, then the former will 1600 # return the table y in that schema, whereas the latter will return the CTE's y column: 1601 # 1602 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1603 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1604 if expression.meta.get("quoted_table"): 1605 table_parts = ".".join(p.name for p in expression.parts) 1606 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1607 1608 return super().table_parts(expression) 1609 1610 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1611 this = expression.this 1612 if isinstance(this, exp.TsOrDsToDatetime): 1613 func_name = "FORMAT_DATETIME" 1614 elif isinstance(this, exp.TsOrDsToTimestamp): 1615 func_name = "FORMAT_TIMESTAMP" 1616 elif isinstance(this, exp.TsOrDsToTime): 1617 func_name = "FORMAT_TIME" 1618 else: 1619 func_name = "FORMAT_DATE" 1620 1621 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1622 return self.func( 1623 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1624 ) 1625 1626 def eq_sql(self, expression: exp.EQ) -> str: 1627 # Operands of = cannot be NULL in BigQuery 1628 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1629 if not isinstance(expression.parent, exp.Update): 1630 return "NULL" 1631 1632 return self.binary(expression, "=") 1633 1634 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1635 parent = expression.parent 1636 1637 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1638 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1639 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1640 return self.func( 1641 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1642 ) 1643 1644 return super().attimezone_sql(expression) 1645 1646 def trycast_sql(self, expression: exp.TryCast) -> str: 1647 return self.cast_sql(expression, safe_prefix="SAFE_") 1648 1649 def bracket_sql(self, expression: exp.Bracket) -> str: 1650 this = expression.this 1651 expressions = expression.expressions 1652 1653 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1654 arg = expressions[0] 1655 if arg.type is None: 1656 from sqlglot.optimizer.annotate_types import annotate_types 1657 1658 arg = annotate_types(arg, dialect=self.dialect) 1659 1660 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1661 # BQ doesn't support bracket syntax with string values for structs 1662 return f"{self.sql(this)}.{arg.name}" 1663 1664 expressions_sql = self.expressions(expression, flat=True) 1665 offset = expression.args.get("offset") 1666 1667 if offset == 0: 1668 expressions_sql = f"OFFSET({expressions_sql})" 1669 elif offset == 1: 1670 expressions_sql = f"ORDINAL({expressions_sql})" 1671 elif offset is not None: 1672 self.unsupported(f"Unsupported array offset: {offset}") 1673 1674 if expression.args.get("safe"): 1675 expressions_sql = f"SAFE_{expressions_sql}" 1676 1677 return f"{self.sql(this)}[{expressions_sql}]" 1678 1679 def in_unnest_op(self, expression: exp.Unnest) -> str: 1680 return self.sql(expression) 1681 1682 def version_sql(self, expression: exp.Version) -> str: 1683 if expression.name == "TIMESTAMP": 1684 expression.set("this", "SYSTEM_TIME") 1685 return super().version_sql(expression) 1686 1687 def contains_sql(self, expression: exp.Contains) -> str: 1688 this = expression.this 1689 expr = expression.expression 1690 1691 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1692 this = this.this 1693 expr = expr.this 1694 1695 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1696 1697 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1698 this = expression.this 1699 1700 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1701 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1702 # because they aren't literals and so the above syntax is invalid BigQuery. 1703 if isinstance(this, exp.Array): 1704 elem = seq_get(this.expressions, 0) 1705 if not (elem and elem.find(exp.Query)): 1706 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1707 1708 return super().cast_sql(expression, safe_prefix=safe_prefix) 1709 1710 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1711 variables = self.expressions(expression, "this") 1712 default = self.sql(expression, "default") 1713 default = f" DEFAULT {default}" if default else "" 1714 kind = self.sql(expression, "kind") 1715 kind = f" {kind}" if kind else "" 1716 1717 return f"{variables}{kind}{default}"
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
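As a quick illustration (a sketch, assuming a recent sqlglot build; the printed output depends on the target dialect's own week conventions and is not asserted here):

import sqlglot

# BigQuery weeks start on Sunday (WEEK_OFFSET = -1), which sqlglot accounts
# for when moving a week-level DATE_TRUNC to a Monday-based dialect.
ast = sqlglot.parse_one("SELECT DATE_TRUNC(d, WEEK) FROM t", read="bigquery")
print(ast.sql(dialect="duckdb"))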
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG)
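A minimal sketch of the argument swap this flag drives (the Hive output shown in the comment is the expected result, not verified here):

import sqlglot

# BigQuery's two-argument LOG(value, base) puts the base second
# (LOG_BASE_FIRST = False), while Hive's LOG(base, value) puts it first.
print(sqlglot.transpile("SELECT LOG(x, 10)", read="bigquery", write="hive"))
# expected: ['SELECT LOG(10, x)']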
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS (SELECT 1 AS id, 2 AS my_id) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the whole query, except:
- BigQuery, which forwards the alias only to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which forwards the alias across the whole query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
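A sketch of how this plays out through the optimizer's qualify step (assuming sqlglot's public qualify API; the printed SQL is illustrative):

import sqlglot
from sqlglot.optimizer.qualify import qualify

sql = """
WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id FROM data
WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
"""
# With early alias expansion, GROUP BY/HAVING should resolve my_id to the
# projection alias (data.id), while the WHERE clause keeps data.my_id.
print(qualify(sqlglot.parse_one(sql, read="bigquery"), dialect="bigquery").sql("bigquery"))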
Whether the name of the function should be preserved inside the node's metadata. This can be useful for roundtripping deprecated and new functions that share an AST node, e.g. JSON_VALUE vs JSON_EXTRACT_SCALAR in BigQuery.
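For instance (a sketch; both spellings are real BigQuery functions):

import sqlglot

# JSON_VALUE and JSON_EXTRACT_SCALAR parse to the same AST node; keeping the
# original name in the node's metadata lets each spelling round-trip as itself.
for fn in ("JSON_VALUE", "JSON_EXTRACT_SCALAR"):
    print(sqlglot.transpile(f"SELECT {fn}(j, '$.x')", read="bigquery", write="bigquery")[0])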
Whether hex strings such as x'CC' evaluate to an integer or a binary/blob type.
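In BigQuery the literal is spelled 0xCC rather than x'CC'; a small sketch (the output is not asserted here):

import sqlglot

# With HEX_STRING_IS_INTEGER_TYPE, 0xCC is treated as an integer literal
# rather than a binary string when parsed as BigQuery.
print(sqlglot.parse_one("SELECT 0xCC", read="bigquery").sql("bigquery"))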
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy').
If empty, the corresponding trie will be constructed from TIME_MAPPING.
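For example (a sketch; the round-trip is the expected behavior given the TIME_MAPPING above):

import sqlglot

# '%e' (day of month without a leading zero) is normalized to strftime '%-d'
# internally, then mapped back to '%e' when generating BigQuery again.
ast = sqlglot.parse_one("SELECT FORMAT_DATE('%e', d)", read="bigquery")
print(ast.sql("bigquery"))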
Columns that are auto-generated by the engine corresponding to this dialect.
For example, such columns may be excluded from SELECT * queries.
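A sketch using a hypothetical schema (db.t and its columns are made up) to show the exclusion during star expansion:

import sqlglot
from sqlglot.optimizer.qualify import qualify

schema = {"db": {"t": {"x": "INT64", "_PARTITIONTIME": "TIMESTAMP"}}}
expr = qualify(
    sqlglot.parse_one("SELECT * FROM db.t", read="bigquery"),
    schema=schema,
    dialect="bigquery",
)
# The pseudo-column should be left out of the expansion, i.e. only x is selected.
print(expr.sql("bigquery"))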
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL
must be explicitly specified.
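Concretely (a sketch; the expected output is not verified here):

import sqlglot

# A bare UNION is invalid in BigQuery, so an explicit quantifier is emitted.
print(sqlglot.transpile("SELECT 1 UNION SELECT 2", read="duckdb", write="bigquery"))
# expected: ['SELECT 1 UNION DISTINCT SELECT 2']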
def normalize_identifier(self, expression: E) -> E:
    if (
        isinstance(expression, exp.Identifier)
        and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
    ):
        parent = expression.parent
        while isinstance(parent, exp.Dot):
            parent = parent.parent

        # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
        # by default. The following check uses a heuristic to detect tables based on whether
        # they are qualified. This should generally be correct, because tables in BigQuery
        # must be qualified with at least a dataset, unless @@dataset_id is set.
        case_sensitive = (
            isinstance(parent, exp.UserDefinedFunction)
            or (
                isinstance(parent, exp.Table)
                and parent.db
                and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
            )
            or expression.meta.get("is_table")
        )
        if not case_sensitive:
            expression.set("this", expression.this.lower())

        return t.cast(E, expression)

    return super().normalize_identifier(expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it
lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so
it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive,
and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system; for example, they may always be case-sensitive on Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
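A sketch of the BigQuery heuristic above, using the optimizer's normalize_identifiers rule (output is indicative only):

import sqlglot
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers

# CTE names and column references are case-insensitive in BigQuery, so they
# should be lowercased here; a dataset-qualified table name would instead be
# treated as case-sensitive and left alone.
expr = sqlglot.parse_one("WITH Cte AS (SELECT 1 AS Col) SELECT Col FROM Cte", read="bigquery")
print(normalize_identifiers(expr, dialect="bigquery").sql("bigquery"))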
Mapping of an escaped sequence (\\n) to its unescaped version (\n).
701 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 702 VAR_TOKENS = { 703 TokenType.DASH, 704 TokenType.VAR, 705 }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BIT_STRINGS
- BYTE_STRINGS
- HEX_STRINGS
- RAW_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- IDENTIFIERS
- QUOTES
- VAR_SINGLE_TOKENS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- COMMENTS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
707 class Tokenizer(tokens.Tokenizer): 708 QUOTES = ["'", '"', '"""', "'''"] 709 COMMENTS = ["--", "#", ("/*", "*/")] 710 IDENTIFIERS = ["`"] 711 STRING_ESCAPES = ["\\"] 712 713 HEX_STRINGS = [("0x", ""), ("0X", "")] 714 715 BYTE_STRINGS = [ 716 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 717 ] 718 719 RAW_STRINGS = [ 720 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 721 ] 722 723 NESTED_COMMENTS = False 724 725 KEYWORDS = { 726 **tokens.Tokenizer.KEYWORDS, 727 "ANY TYPE": TokenType.VARIANT, 728 "BEGIN": TokenType.COMMAND, 729 "BEGIN TRANSACTION": TokenType.BEGIN, 730 "BYTEINT": TokenType.INT, 731 "BYTES": TokenType.BINARY, 732 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 733 "DATETIME": TokenType.TIMESTAMP, 734 "DECLARE": TokenType.DECLARE, 735 "ELSEIF": TokenType.COMMAND, 736 "EXCEPTION": TokenType.COMMAND, 737 "EXPORT": TokenType.EXPORT, 738 "FLOAT64": TokenType.DOUBLE, 739 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 740 "LOOP": TokenType.COMMAND, 741 "MODEL": TokenType.MODEL, 742 "NOT DETERMINISTIC": TokenType.VOLATILE, 743 "RECORD": TokenType.STRUCT, 744 "REPEAT": TokenType.COMMAND, 745 "TIMESTAMP": TokenType.TIMESTAMPTZ, 746 "WHILE": TokenType.COMMAND, 747 } 748 KEYWORDS.pop("DIV") 749 KEYWORDS.pop("VALUES") 750 KEYWORDS.pop("/*+")
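A few of the lexical features declared above can be exercised directly with sqlglot.tokenize (a sketch; the printed token names are indicative):

import sqlglot

# Triple-quoted strings, r'' raw strings, b'' byte strings, backtick
# identifiers and '#' comments are all BigQuery-specific Tokenizer settings.
sql = "SELECT '''multi''', r'\\d+', b'bytes', `col` FROM t # trailing comment"
for token in sqlglot.tokenize(sql, read="bigquery"):
    print(token.token_type, repr(token.text))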
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
752 class Parser(parser.Parser): 753 PREFIXED_PIVOT_COLUMNS = True 754 LOG_DEFAULTS_TO_LN = True 755 SUPPORTS_IMPLICIT_UNNEST = True 756 JOINS_HAVE_EQUAL_PRECEDENCE = True 757 758 # BigQuery does not allow ASC/DESC to be used as an identifier, allows GRANT as an identifier 759 ID_VAR_TOKENS = { 760 *parser.Parser.ID_VAR_TOKENS, 761 TokenType.GRANT, 762 } - {TokenType.ASC, TokenType.DESC} 763 764 ALIAS_TOKENS = { 765 *parser.Parser.ALIAS_TOKENS, 766 TokenType.GRANT, 767 } - {TokenType.ASC, TokenType.DESC} 768 769 TABLE_ALIAS_TOKENS = { 770 *parser.Parser.TABLE_ALIAS_TOKENS, 771 TokenType.GRANT, 772 } - {TokenType.ASC, TokenType.DESC} 773 774 COMMENT_TABLE_ALIAS_TOKENS = { 775 *parser.Parser.COMMENT_TABLE_ALIAS_TOKENS, 776 TokenType.GRANT, 777 } - {TokenType.ASC, TokenType.DESC} 778 779 UPDATE_ALIAS_TOKENS = { 780 *parser.Parser.UPDATE_ALIAS_TOKENS, 781 TokenType.GRANT, 782 } - {TokenType.ASC, TokenType.DESC} 783 784 FUNCTIONS = { 785 **parser.Parser.FUNCTIONS, 786 "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list, 787 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 788 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 789 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 790 "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list, 791 "BOOL": exp.JSONBool.from_arg_list, 792 "CONTAINS_SUBSTR": _build_contains_substring, 793 "DATE": _build_date, 794 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 795 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 796 "DATE_TRUNC": lambda args: exp.DateTrunc( 797 unit=seq_get(args, 1), 798 this=seq_get(args, 0), 799 zone=seq_get(args, 2), 800 ), 801 "DATETIME": _build_datetime, 802 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 803 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 804 "DIV": binary_from_function(exp.IntDiv), 805 "EDIT_DISTANCE": _build_levenshtein, 806 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 807 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 808 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 809 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 810 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 811 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 812 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 813 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 814 "JSON_STRIP_NULLS": _build_json_strip_nulls, 815 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 816 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 817 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 818 "MD5": exp.MD5Digest.from_arg_list, 819 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 820 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 821 ), 822 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 823 "TO_HEX": _build_to_hex, 824 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 825 [seq_get(args, 1), seq_get(args, 0)] 826 ), 827 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 828 [seq_get(args, 1), seq_get(args, 0)] 829 ), 830 "PARSE_TIMESTAMP": _build_parse_timestamp, 831 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 832 [seq_get(args, 1), seq_get(args, 0)] 833 ), 834 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 835 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 836 "REGEXP_SUBSTR": 
_build_regexp_extract(exp.RegexpExtract), 837 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 838 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 839 ), 840 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 841 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 842 "SPLIT": lambda args: exp.Split( 843 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 844 this=seq_get(args, 0), 845 expression=seq_get(args, 1) or exp.Literal.string(","), 846 ), 847 "STRPOS": exp.StrPosition.from_arg_list, 848 "TIME": _build_time, 849 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 850 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 851 "TIMESTAMP": _build_timestamp, 852 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 853 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 854 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 855 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 856 ), 857 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 858 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 859 ), 860 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 861 "TO_JSON": lambda args: exp.JSONFormat( 862 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 863 ), 864 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 865 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 866 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 867 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 868 "FROM_HEX": exp.Unhex.from_arg_list, 869 "WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 870 } 871 872 FUNCTION_PARSERS = { 873 **parser.Parser.FUNCTION_PARSERS, 874 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 875 "JSON_ARRAY": lambda self: self.expression( 876 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 877 ), 878 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 879 "PREDICT": lambda self: self._parse_ml(exp.Predict), 880 "TRANSLATE": lambda self: self._parse_translate(), 881 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 882 "GENERATE_EMBEDDING": lambda self: self._parse_ml(exp.GenerateEmbedding), 883 "GENERATE_TEXT_EMBEDDING": lambda self: self._parse_ml( 884 exp.GenerateEmbedding, is_text=True 885 ), 886 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 887 "FORECAST": lambda self: self._parse_ml(exp.MLForecast), 888 } 889 FUNCTION_PARSERS.pop("TRIM") 890 891 NO_PAREN_FUNCTIONS = { 892 **parser.Parser.NO_PAREN_FUNCTIONS, 893 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 894 } 895 896 NESTED_TYPE_TOKENS = { 897 *parser.Parser.NESTED_TYPE_TOKENS, 898 TokenType.TABLE, 899 } 900 901 PROPERTY_PARSERS = { 902 **parser.Parser.PROPERTY_PARSERS, 903 "NOT DETERMINISTIC": lambda self: self.expression( 904 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 905 ), 906 "OPTIONS": lambda self: self._parse_with_property(), 907 } 908 909 CONSTRAINT_PARSERS = { 910 **parser.Parser.CONSTRAINT_PARSERS, 911 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 912 } 913 914 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 915 RANGE_PARSERS.pop(TokenType.OVERLAPS) 916 917 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 918 919 STATEMENT_PARSERS = { 920 **parser.Parser.STATEMENT_PARSERS, 921 TokenType.ELSE: lambda self: 
self._parse_as_command(self._prev), 922 TokenType.END: lambda self: self._parse_as_command(self._prev), 923 TokenType.FOR: lambda self: self._parse_for_in(), 924 TokenType.EXPORT: lambda self: self._parse_export_data(), 925 TokenType.DECLARE: lambda self: self._parse_declare(), 926 } 927 928 BRACKET_OFFSETS = { 929 "OFFSET": (0, False), 930 "ORDINAL": (1, False), 931 "SAFE_OFFSET": (0, True), 932 "SAFE_ORDINAL": (1, True), 933 } 934 935 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 936 index = self._index 937 this = self._parse_range() 938 self._match_text_seq("DO") 939 if self._match(TokenType.COMMAND): 940 self._retreat(index) 941 return self._parse_as_command(self._prev) 942 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 943 944 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 945 this = super()._parse_table_part(schema=schema) or self._parse_number() 946 947 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 948 if isinstance(this, exp.Identifier): 949 table_name = this.name 950 while self._match(TokenType.DASH, advance=False) and self._next: 951 start = self._curr 952 while self._is_connected() and not self._match_set( 953 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 954 ): 955 self._advance() 956 957 if start == self._curr: 958 break 959 960 table_name += self._find_sql(start, self._prev) 961 962 this = exp.Identifier( 963 this=table_name, quoted=this.args.get("quoted") 964 ).update_positions(this) 965 elif isinstance(this, exp.Literal): 966 table_name = this.name 967 968 if self._is_connected() and self._parse_var(any_token=True): 969 table_name += self._prev.text 970 971 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 972 973 return this 974 975 def _parse_table_parts( 976 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 977 ) -> exp.Table: 978 table = super()._parse_table_parts( 979 schema=schema, is_db_reference=is_db_reference, wildcard=True 980 ) 981 982 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 983 if not table.catalog: 984 if table.db: 985 previous_db = table.args["db"] 986 parts = table.db.split(".") 987 if len(parts) == 2 and not table.args["db"].quoted: 988 table.set( 989 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 990 ) 991 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 992 else: 993 previous_this = table.this 994 parts = table.name.split(".") 995 if len(parts) == 2 and not table.this.quoted: 996 table.set( 997 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 998 ) 999 table.set( 1000 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 1001 ) 1002 1003 if isinstance(table.this, exp.Identifier) and any("." 
in p.name for p in table.parts): 1004 alias = table.this 1005 catalog, db, this, *rest = ( 1006 exp.to_identifier(p, quoted=True) 1007 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 1008 ) 1009 1010 for part in (catalog, db, this): 1011 if part: 1012 part.update_positions(table.this) 1013 1014 if rest and this: 1015 this = exp.Dot.build([this, *rest]) # type: ignore 1016 1017 table = exp.Table( 1018 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 1019 ) 1020 table.meta["quoted_table"] = True 1021 else: 1022 alias = None 1023 1024 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 1025 # dataset, so if the project identifier is omitted we need to fix the ast so that 1026 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 1027 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 1028 # views, because it would seem like the "catalog" part is set, when it'd actually 1029 # be the region/dataset. Merging the two identifiers into a single one is done to 1030 # avoid producing a 4-part Table reference, which would cause issues in the schema 1031 # module, when there are 3-part table names mixed with information schema views. 1032 # 1033 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 1034 table_parts = table.parts 1035 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 1036 # We need to alias the table here to avoid breaking existing qualified columns. 1037 # This is expected to be safe, because if there's an actual alias coming up in 1038 # the token stream, it will overwrite this one. If there isn't one, we are only 1039 # exposing the name that can be used to reference the view explicitly (a no-op). 1040 exp.alias_( 1041 table, 1042 t.cast(exp.Identifier, alias or table_parts[-1]), 1043 table=True, 1044 copy=False, 1045 ) 1046 1047 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 1048 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 1049 line=table_parts[-2].meta.get("line"), 1050 col=table_parts[-1].meta.get("col"), 1051 start=table_parts[-2].meta.get("start"), 1052 end=table_parts[-1].meta.get("end"), 1053 ) 1054 table.set("this", new_this) 1055 table.set("db", seq_get(table_parts, -3)) 1056 table.set("catalog", seq_get(table_parts, -4)) 1057 1058 return table 1059 1060 def _parse_column(self) -> t.Optional[exp.Expression]: 1061 column = super()._parse_column() 1062 if isinstance(column, exp.Column): 1063 parts = column.parts 1064 if any("." in p.name for p in parts): 1065 catalog, db, table, this, *rest = ( 1066 exp.to_identifier(p, quoted=True) 1067 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 1068 ) 1069 1070 if rest and this: 1071 this = exp.Dot.build([this, *rest]) # type: ignore 1072 1073 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 1074 column.meta["quoted_column"] = True 1075 1076 return column 1077 1078 @t.overload 1079 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 1080 1081 @t.overload 1082 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 
1083 1084 def _parse_json_object(self, agg=False): 1085 json_object = super()._parse_json_object() 1086 array_kv_pair = seq_get(json_object.expressions, 0) 1087 1088 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 1089 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 1090 if ( 1091 array_kv_pair 1092 and isinstance(array_kv_pair.this, exp.Array) 1093 and isinstance(array_kv_pair.expression, exp.Array) 1094 ): 1095 keys = array_kv_pair.this.expressions 1096 values = array_kv_pair.expression.expressions 1097 1098 json_object.set( 1099 "expressions", 1100 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 1101 ) 1102 1103 return json_object 1104 1105 def _parse_bracket( 1106 self, this: t.Optional[exp.Expression] = None 1107 ) -> t.Optional[exp.Expression]: 1108 bracket = super()._parse_bracket(this) 1109 1110 if this is bracket: 1111 return bracket 1112 1113 if isinstance(bracket, exp.Bracket): 1114 for expression in bracket.expressions: 1115 name = expression.name.upper() 1116 1117 if name not in self.BRACKET_OFFSETS: 1118 break 1119 1120 offset, safe = self.BRACKET_OFFSETS[name] 1121 bracket.set("offset", offset) 1122 bracket.set("safe", safe) 1123 expression.replace(expression.expressions[0]) 1124 1125 return bracket 1126 1127 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 1128 unnest = super()._parse_unnest(with_alias=with_alias) 1129 1130 if not unnest: 1131 return None 1132 1133 unnest_expr = seq_get(unnest.expressions, 0) 1134 if unnest_expr: 1135 from sqlglot.optimizer.annotate_types import annotate_types 1136 1137 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 1138 1139 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 1140 # in contrast to other dialects such as DuckDB which flattens only the array by default 1141 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 1142 array_elem.is_type(exp.DataType.Type.STRUCT) 1143 for array_elem in unnest_expr._type.expressions 1144 ): 1145 unnest.set("explode_array", True) 1146 1147 return unnest 1148 1149 def _parse_make_interval(self) -> exp.MakeInterval: 1150 expr = exp.MakeInterval() 1151 1152 for arg_key in expr.arg_types: 1153 value = self._parse_lambda() 1154 1155 if not value: 1156 break 1157 1158 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1159 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1160 if isinstance(value, exp.Kwarg): 1161 arg_key = value.this.name 1162 1163 expr.set(arg_key, value) 1164 1165 self._match(TokenType.COMMA) 1166 1167 return expr 1168 1169 def _parse_ml(self, expr_type: t.Type[E], **kwargs) -> E: 1170 self._match_text_seq("MODEL") 1171 this = self._parse_table() 1172 1173 self._match(TokenType.COMMA) 1174 self._match_text_seq("TABLE") 1175 1176 # Certain functions like ML.FORECAST require a STRUCT argument but not a TABLE/SELECT one 1177 expression = ( 1178 self._parse_table() if not self._match(TokenType.STRUCT, advance=False) else None 1179 ) 1180 1181 self._match(TokenType.COMMA) 1182 1183 return self.expression( 1184 expr_type, 1185 this=this, 1186 expression=expression, 1187 params_struct=self._parse_bitwise(), 1188 **kwargs, 1189 ) 1190 1191 def _parse_translate(self) -> exp.Translate | exp.MLTranslate: 1192 # Check if this is ML.TRANSLATE by looking at previous tokens 1193 token = seq_get(self._tokens, self._index - 4) 1194 if token and 
token.text.upper() == "ML": 1195 return self._parse_ml(exp.MLTranslate) 1196 1197 return exp.Translate.from_arg_list(self._parse_function_args()) 1198 1199 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 1200 self._match(TokenType.TABLE) 1201 this = self._parse_table() 1202 1203 expr = self.expression(exp.FeaturesAtTime, this=this) 1204 1205 while self._match(TokenType.COMMA): 1206 arg = self._parse_lambda() 1207 1208 # Get the LHS of the Kwarg and set the arg to that value, e.g 1209 # "num_rows => 1" sets the expr's `num_rows` arg 1210 if arg: 1211 expr.set(arg.this.name, arg) 1212 1213 return expr 1214 1215 def _parse_vector_search(self) -> exp.VectorSearch: 1216 self._match(TokenType.TABLE) 1217 base_table = self._parse_table() 1218 1219 self._match(TokenType.COMMA) 1220 1221 column_to_search = self._parse_bitwise() 1222 self._match(TokenType.COMMA) 1223 1224 self._match(TokenType.TABLE) 1225 query_table = self._parse_table() 1226 1227 expr = self.expression( 1228 exp.VectorSearch, 1229 this=base_table, 1230 column_to_search=column_to_search, 1231 query_table=query_table, 1232 ) 1233 1234 while self._match(TokenType.COMMA): 1235 # query_column_to_search can be named argument or positional 1236 if self._match(TokenType.STRING, advance=False): 1237 query_column = self._parse_string() 1238 expr.set("query_column_to_search", query_column) 1239 else: 1240 arg = self._parse_lambda() 1241 if arg: 1242 expr.set(arg.this.name, arg) 1243 1244 return expr 1245 1246 def _parse_export_data(self) -> exp.Export: 1247 self._match_text_seq("DATA") 1248 1249 return self.expression( 1250 exp.Export, 1251 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1252 options=self._parse_properties(), 1253 this=self._match_text_seq("AS") and self._parse_select(), 1254 )
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
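These options are normally passed through the top-level API rather than by constructing a Parser directly (a sketch, assuming standard option forwarding):

import sqlglot
from sqlglot.errors import ErrorLevel

# Parser options such as error_level are forwarded from parse_one/transpile.
sqlglot.parse_one("SELECT 1", read="bigquery", error_level=ErrorLevel.RAISE)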
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- CAST_COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- QUERY_MODIFIER_TOKENS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS
- JSON_EXTRACT_REQUIRES_JSON_EXPRESSION
- ADD_JOIN_ON_TRUE
- SUPPORTS_OMITTED_INTERVAL_SPAN_UNIT
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- build_cast
- errors
- sql
1256 class Generator(generator.Generator): 1257 INTERVAL_ALLOWS_PLURAL_FORM = False 1258 JOIN_HINTS = False 1259 QUERY_HINTS = False 1260 TABLE_HINTS = False 1261 LIMIT_FETCH = "LIMIT" 1262 RENAME_TABLE_WITH_DB = False 1263 NVL2_SUPPORTED = False 1264 UNNEST_WITH_ORDINALITY = False 1265 COLLATE_IS_FUNC = True 1266 LIMIT_ONLY_LITERALS = True 1267 SUPPORTS_TABLE_ALIAS_COLUMNS = False 1268 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 1269 JSON_KEY_VALUE_PAIR_SEP = "," 1270 NULL_ORDERING_SUPPORTED = False 1271 IGNORE_NULLS_IN_FUNC = True 1272 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 1273 CAN_IMPLEMENT_ARRAY_ANY = True 1274 SUPPORTS_TO_NUMBER = False 1275 NAMED_PLACEHOLDER_TOKEN = "@" 1276 HEX_FUNC = "TO_HEX" 1277 WITH_PROPERTIES_PREFIX = "OPTIONS" 1278 SUPPORTS_EXPLODING_PROJECTIONS = False 1279 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 1280 SUPPORTS_UNIX_SECONDS = True 1281 1282 SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$") 1283 1284 TS_OR_DS_TYPES = ( 1285 exp.TsOrDsToDatetime, 1286 exp.TsOrDsToTimestamp, 1287 exp.TsOrDsToTime, 1288 exp.TsOrDsToDate, 1289 ) 1290 1291 TRANSFORMS = { 1292 **generator.Generator.TRANSFORMS, 1293 exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"), 1294 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 1295 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 1296 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 1297 exp.Array: inline_array_unless_query, 1298 exp.ArrayContains: _array_contains_sql, 1299 exp.ArrayFilter: filter_array_using_unnest, 1300 exp.ArrayRemove: filter_array_using_unnest, 1301 exp.BitwiseAndAgg: rename_func("BIT_AND"), 1302 exp.BitwiseOrAgg: rename_func("BIT_OR"), 1303 exp.BitwiseXorAgg: rename_func("BIT_XOR"), 1304 exp.BitwiseCountAgg: rename_func("BIT_COUNT"), 1305 exp.ByteLength: rename_func("BYTE_LENGTH"), 1306 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 1307 exp.CollateProperty: lambda self, e: ( 1308 f"DEFAULT COLLATE {self.sql(e, 'this')}" 1309 if e.args.get("default") 1310 else f"COLLATE {self.sql(e, 'this')}" 1311 ), 1312 exp.Commit: lambda *_: "COMMIT TRANSACTION", 1313 exp.CountIf: rename_func("COUNTIF"), 1314 exp.Create: _create_sql, 1315 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 1316 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 1317 exp.DateDiff: lambda self, e: self.func( 1318 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 1319 ), 1320 exp.DateFromParts: rename_func("DATE"), 1321 exp.DateStrToDate: datestrtodate_sql, 1322 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 1323 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 1324 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 1325 exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"), 1326 exp.FromTimeZone: lambda self, e: self.func( 1327 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 1328 ), 1329 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 1330 exp.GroupConcat: lambda self, e: groupconcat_sql( 1331 self, e, func_name="STRING_AGG", within_group=False 1332 ), 1333 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1334 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1335 exp.If: if_sql(false_value="NULL"), 1336 exp.ILike: no_ilike_sql, 1337 exp.IntDiv: rename_func("DIV"), 1338 exp.Int64: rename_func("INT64"), 1339 exp.JSONBool: rename_func("BOOL"), 1340 exp.JSONExtract: _json_extract_sql, 1341 exp.JSONExtractArray: _json_extract_sql, 1342 exp.JSONExtractScalar: 
_json_extract_sql, 1343 exp.JSONFormat: lambda self, e: self.func( 1344 "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING", 1345 e.this, 1346 e.args.get("options"), 1347 ), 1348 exp.JSONKeysAtDepth: rename_func("JSON_KEYS"), 1349 exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"), 1350 exp.Levenshtein: _levenshtein_sql, 1351 exp.Max: max_or_greatest, 1352 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1353 exp.MD5Digest: rename_func("MD5"), 1354 exp.Min: min_or_least, 1355 exp.Normalize: lambda self, e: self.func( 1356 "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE", 1357 e.this, 1358 e.args.get("form"), 1359 ), 1360 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1361 exp.RegexpExtract: lambda self, e: self.func( 1362 "REGEXP_EXTRACT", 1363 e.this, 1364 e.expression, 1365 e.args.get("position"), 1366 e.args.get("occurrence"), 1367 ), 1368 exp.RegexpExtractAll: lambda self, e: self.func( 1369 "REGEXP_EXTRACT_ALL", e.this, e.expression 1370 ), 1371 exp.RegexpReplace: regexp_replace_sql, 1372 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1373 exp.ReturnsProperty: _returnsproperty_sql, 1374 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1375 exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this), 1376 exp.ParseDatetime: lambda self, e: self.func( 1377 "PARSE_DATETIME", self.format_time(e), e.this 1378 ), 1379 exp.Select: transforms.preprocess( 1380 [ 1381 transforms.explode_projection_to_unnest(), 1382 transforms.unqualify_unnest, 1383 transforms.eliminate_distinct_on, 1384 _alias_ordered_group, 1385 transforms.eliminate_semi_and_anti_joins, 1386 ] 1387 ), 1388 exp.SHA: rename_func("SHA1"), 1389 exp.SHA2: sha256_sql, 1390 exp.StabilityProperty: lambda self, e: ( 1391 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1392 ), 1393 exp.String: rename_func("STRING"), 1394 exp.StrPosition: lambda self, e: ( 1395 strposition_sql( 1396 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1397 ) 1398 ), 1399 exp.StrToDate: _str_to_datetime_sql, 1400 exp.StrToTime: _str_to_datetime_sql, 1401 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1402 exp.TimeFromParts: rename_func("TIME"), 1403 exp.TimestampFromParts: rename_func("DATETIME"), 1404 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1405 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1406 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1407 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1408 exp.TimeStrToTime: timestrtotime_sql, 1409 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1410 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1411 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1412 exp.TsOrDsToTime: rename_func("TIME"), 1413 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1414 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1415 exp.Unhex: rename_func("FROM_HEX"), 1416 exp.UnixDate: rename_func("UNIX_DATE"), 1417 exp.UnixToTime: _unix_to_time_sql, 1418 exp.Uuid: lambda *_: "GENERATE_UUID()", 1419 exp.Values: _derived_table_values_to_unnest, 1420 exp.VariancePop: rename_func("VAR_POP"), 1421 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1422 } 1423 1424 SUPPORTED_JSON_PATH_PARTS = { 1425 exp.JSONPathKey, 1426 exp.JSONPathRoot, 1427 exp.JSONPathSubscript, 1428 } 1429 1430 TYPE_MAPPING = { 1431 **generator.Generator.TYPE_MAPPING, 1432 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1433 exp.DataType.Type.BIGINT: "INT64", 1434 exp.DataType.Type.BINARY: "BYTES", 1435 
exp.DataType.Type.BLOB: "BYTES", 1436 exp.DataType.Type.BOOLEAN: "BOOL", 1437 exp.DataType.Type.CHAR: "STRING", 1438 exp.DataType.Type.DECIMAL: "NUMERIC", 1439 exp.DataType.Type.DOUBLE: "FLOAT64", 1440 exp.DataType.Type.FLOAT: "FLOAT64", 1441 exp.DataType.Type.INT: "INT64", 1442 exp.DataType.Type.NCHAR: "STRING", 1443 exp.DataType.Type.NVARCHAR: "STRING", 1444 exp.DataType.Type.SMALLINT: "INT64", 1445 exp.DataType.Type.TEXT: "STRING", 1446 exp.DataType.Type.TIMESTAMP: "DATETIME", 1447 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1448 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1449 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1450 exp.DataType.Type.TINYINT: "INT64", 1451 exp.DataType.Type.ROWVERSION: "BYTES", 1452 exp.DataType.Type.UUID: "STRING", 1453 exp.DataType.Type.VARBINARY: "BYTES", 1454 exp.DataType.Type.VARCHAR: "STRING", 1455 exp.DataType.Type.VARIANT: "ANY TYPE", 1456 } 1457 1458 PROPERTIES_LOCATION = { 1459 **generator.Generator.PROPERTIES_LOCATION, 1460 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1461 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1462 } 1463 1464 # WINDOW comes after QUALIFY 1465 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1466 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1467 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1468 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1469 } 1470 1471 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1472 RESERVED_KEYWORDS = { 1473 "all", 1474 "and", 1475 "any", 1476 "array", 1477 "as", 1478 "asc", 1479 "assert_rows_modified", 1480 "at", 1481 "between", 1482 "by", 1483 "case", 1484 "cast", 1485 "collate", 1486 "contains", 1487 "create", 1488 "cross", 1489 "cube", 1490 "current", 1491 "default", 1492 "define", 1493 "desc", 1494 "distinct", 1495 "else", 1496 "end", 1497 "enum", 1498 "escape", 1499 "except", 1500 "exclude", 1501 "exists", 1502 "extract", 1503 "false", 1504 "fetch", 1505 "following", 1506 "for", 1507 "from", 1508 "full", 1509 "group", 1510 "grouping", 1511 "groups", 1512 "hash", 1513 "having", 1514 "if", 1515 "ignore", 1516 "in", 1517 "inner", 1518 "intersect", 1519 "interval", 1520 "into", 1521 "is", 1522 "join", 1523 "lateral", 1524 "left", 1525 "like", 1526 "limit", 1527 "lookup", 1528 "merge", 1529 "natural", 1530 "new", 1531 "no", 1532 "not", 1533 "null", 1534 "nulls", 1535 "of", 1536 "on", 1537 "or", 1538 "order", 1539 "outer", 1540 "over", 1541 "partition", 1542 "preceding", 1543 "proto", 1544 "qualify", 1545 "range", 1546 "recursive", 1547 "respect", 1548 "right", 1549 "rollup", 1550 "rows", 1551 "select", 1552 "set", 1553 "some", 1554 "struct", 1555 "tablesample", 1556 "then", 1557 "to", 1558 "treat", 1559 "true", 1560 "unbounded", 1561 "union", 1562 "unnest", 1563 "using", 1564 "when", 1565 "where", 1566 "window", 1567 "with", 1568 "within", 1569 } 1570 1571 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1572 unit = expression.unit 1573 unit_sql = unit.name if unit.is_string else self.sql(unit) 1574 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1575 1576 def mod_sql(self, expression: exp.Mod) -> str: 1577 this = expression.this 1578 expr = expression.expression 1579 return self.func( 1580 "MOD", 1581 this.unnest() if isinstance(this, exp.Paren) else this, 1582 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1583 ) 1584 1585 def column_parts(self, expression: 
exp.Column) -> str: 1586 if expression.meta.get("quoted_column"): 1587 # If a column reference is of the form `dataset.table`.name, we need 1588 # to preserve the quoted table path, otherwise the reference breaks 1589 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1590 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1591 return f"{table_path}.{self.sql(expression, 'this')}" 1592 1593 return super().column_parts(expression) 1594 1595 def table_parts(self, expression: exp.Table) -> str: 1596 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1597 # we need to make sure the correct quoting is used in each case. 1598 # 1599 # For example, if there is a CTE x that clashes with a schema name, then the former will 1600 # return the table y in that schema, whereas the latter will return the CTE's y column: 1601 # 1602 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1603 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1604 if expression.meta.get("quoted_table"): 1605 table_parts = ".".join(p.name for p in expression.parts) 1606 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1607 1608 return super().table_parts(expression) 1609 1610 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1611 this = expression.this 1612 if isinstance(this, exp.TsOrDsToDatetime): 1613 func_name = "FORMAT_DATETIME" 1614 elif isinstance(this, exp.TsOrDsToTimestamp): 1615 func_name = "FORMAT_TIMESTAMP" 1616 elif isinstance(this, exp.TsOrDsToTime): 1617 func_name = "FORMAT_TIME" 1618 else: 1619 func_name = "FORMAT_DATE" 1620 1621 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1622 return self.func( 1623 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1624 ) 1625 1626 def eq_sql(self, expression: exp.EQ) -> str: 1627 # Operands of = cannot be NULL in BigQuery 1628 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1629 if not isinstance(expression.parent, exp.Update): 1630 return "NULL" 1631 1632 return self.binary(expression, "=") 1633 1634 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1635 parent = expression.parent 1636 1637 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1638 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1639 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1640 return self.func( 1641 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1642 ) 1643 1644 return super().attimezone_sql(expression) 1645 1646 def trycast_sql(self, expression: exp.TryCast) -> str: 1647 return self.cast_sql(expression, safe_prefix="SAFE_") 1648 1649 def bracket_sql(self, expression: exp.Bracket) -> str: 1650 this = expression.this 1651 expressions = expression.expressions 1652 1653 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1654 arg = expressions[0] 1655 if arg.type is None: 1656 from sqlglot.optimizer.annotate_types import annotate_types 1657 1658 arg = annotate_types(arg, dialect=self.dialect) 1659 1660 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1661 # BQ doesn't support bracket syntax with string values for structs 1662 return f"{self.sql(this)}.{arg.name}" 1663 1664 expressions_sql = self.expressions(expression, flat=True) 1665 offset = expression.args.get("offset") 1666 1667 if offset == 0: 1668 expressions_sql = f"OFFSET({expressions_sql})" 1669 elif offset == 1: 1670 expressions_sql = f"ORDINAL({expressions_sql})" 1671 elif offset is not None: 1672 self.unsupported(f"Unsupported array offset: {offset}") 1673 1674 if expression.args.get("safe"): 1675 expressions_sql = f"SAFE_{expressions_sql}" 1676 1677 return f"{self.sql(this)}[{expressions_sql}]" 1678 1679 def in_unnest_op(self, expression: exp.Unnest) -> str: 1680 return self.sql(expression) 1681 1682 def version_sql(self, expression: exp.Version) -> str: 1683 if expression.name == "TIMESTAMP": 1684 expression.set("this", "SYSTEM_TIME") 1685 return super().version_sql(expression) 1686 1687 def contains_sql(self, expression: exp.Contains) -> str: 1688 this = expression.this 1689 expr = expression.expression 1690 1691 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1692 this = this.this 1693 expr = expr.this 1694 1695 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1696 1697 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1698 this = expression.this 1699 1700 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1701 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1702 # because they aren't literals and so the above syntax is invalid BigQuery. 1703 if isinstance(this, exp.Array): 1704 elem = seq_get(this.expressions, 0) 1705 if not (elem and elem.find(exp.Query)): 1706 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1707 1708 return super().cast_sql(expression, safe_prefix=safe_prefix) 1709 1710 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1711 variables = self.expressions(expression, "this") 1712 default = self.sql(expression, "default") 1713 default = f" DEFAULT {default}" if default else "" 1714 kind = self.sql(expression, "kind") 1715 kind = f" {kind}" if kind else "" 1716 1717 return f"{variables}{kind}{default}"
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
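In practice these settings are passed through sql() or transpile() rather than by constructing a Generator directly (a minimal sketch):

import sqlglot

# pretty, indent and pad are generator options.
expr = sqlglot.parse_one("SELECT a, b FROM t WHERE a > 1", read="bigquery")
print(expr.sql(dialect="bigquery", pretty=True))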
1585 def column_parts(self, expression: exp.Column) -> str: 1586 if expression.meta.get("quoted_column"): 1587 # If a column reference is of the form `dataset.table`.name, we need 1588 # to preserve the quoted table path, otherwise the reference breaks 1589 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1590 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1591 return f"{table_path}.{self.sql(expression, 'this')}" 1592 1593 return super().column_parts(expression)
1595 def table_parts(self, expression: exp.Table) -> str: 1596 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1597 # we need to make sure the correct quoting is used in each case. 1598 # 1599 # For example, if there is a CTE x that clashes with a schema name, then the former will 1600 # return the table y in that schema, whereas the latter will return the CTE's y column: 1601 # 1602 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1603 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1604 if expression.meta.get("quoted_table"): 1605 table_parts = ".".join(p.name for p in expression.parts) 1606 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1607 1608 return super().table_parts(expression)
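A sketch of the distinction (outputs indicative): a single backtick-quoted path round-trips as one identifier, whereas dotted parts stay separate:

import sqlglot

# `proj.dataset.tbl` (one quoted identifier) vs. proj.dataset.tbl (three
# parts) are each preserved in their original form.
print(sqlglot.transpile("SELECT * FROM `proj.dataset.tbl`", read="bigquery")[0])
print(sqlglot.transpile("SELECT * FROM proj.dataset.tbl", read="bigquery")[0])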
1610 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1611 this = expression.this 1612 if isinstance(this, exp.TsOrDsToDatetime): 1613 func_name = "FORMAT_DATETIME" 1614 elif isinstance(this, exp.TsOrDsToTimestamp): 1615 func_name = "FORMAT_TIMESTAMP" 1616 elif isinstance(this, exp.TsOrDsToTime): 1617 func_name = "FORMAT_TIME" 1618 else: 1619 func_name = "FORMAT_DATE" 1620 1621 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1622 return self.func( 1623 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1624 )
1626 def eq_sql(self, expression: exp.EQ) -> str: 1627 # Operands of = cannot be NULL in BigQuery 1628 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1629 if not isinstance(expression.parent, exp.Update): 1630 return "NULL" 1631 1632 return self.binary(expression, "=")
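For example (a sketch; the exact output follows from the transform above):

import sqlglot

# Comparing against NULL with = can never be TRUE in BigQuery, so the
# generator folds the whole comparison to NULL.
print(sqlglot.transpile("SELECT a = NULL FROM t", read="bigquery")[0])
# e.g. SELECT NULL FROM t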
1634 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1635 parent = expression.parent 1636 1637 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1638 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1639 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1640 return self.func( 1641 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1642 ) 1643 1644 return super().attimezone_sql(expression)
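A sketch of the conversion, reading a dialect that supports AT TIME ZONE and writing BigQuery (output indicative):

import sqlglot

# AT TIME ZONE has no direct BigQuery equivalent, so it is rendered as a
# TIMESTAMP(DATETIME(..., zone)) call per the transform above.
print(sqlglot.transpile("SELECT ts AT TIME ZONE 'UTC' FROM t", read="postgres", write="bigquery")[0])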
1649 def bracket_sql(self, expression: exp.Bracket) -> str: 1650 this = expression.this 1651 expressions = expression.expressions 1652 1653 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1654 arg = expressions[0] 1655 if arg.type is None: 1656 from sqlglot.optimizer.annotate_types import annotate_types 1657 1658 arg = annotate_types(arg, dialect=self.dialect) 1659 1660 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1661 # BQ doesn't support bracket syntax with string values for structs 1662 return f"{self.sql(this)}.{arg.name}" 1663 1664 expressions_sql = self.expressions(expression, flat=True) 1665 offset = expression.args.get("offset") 1666 1667 if offset == 0: 1668 expressions_sql = f"OFFSET({expressions_sql})" 1669 elif offset == 1: 1670 expressions_sql = f"ORDINAL({expressions_sql})" 1671 elif offset is not None: 1672 self.unsupported(f"Unsupported array offset: {offset}") 1673 1674 if expression.args.get("safe"): 1675 expressions_sql = f"SAFE_{expressions_sql}" 1676 1677 return f"{self.sql(this)}[{expressions_sql}]"
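For instance (a sketch), BigQuery's positional wrappers round-trip through the offset/safe args set by the parser:

import sqlglot

# OFFSET is zero-based, ORDINAL is one-based, and the SAFE_ variants return
# NULL instead of erroring on out-of-bounds access.
print(sqlglot.transpile("SELECT arr[OFFSET(0)], arr[SAFE_ORDINAL(2)] FROM t", read="bigquery")[0])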
1687 def contains_sql(self, expression: exp.Contains) -> str: 1688 this = expression.this 1689 expr = expression.expression 1690 1691 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1692 this = this.this 1693 expr = expr.this 1694 1695 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope"))
1697 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1698 this = expression.this 1699 1700 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1701 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1702 # because they aren't literals and so the above syntax is invalid BigQuery. 1703 if isinstance(this, exp.Array): 1704 elem = seq_get(this.expressions, 0) 1705 if not (elem and elem.find(exp.Query)): 1706 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1707 1708 return super().cast_sql(expression, safe_prefix=safe_prefix)
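For example (a sketch):

import sqlglot

# A typed array literal is not wrapped in CAST, so it should round-trip as-is.
print(sqlglot.transpile("SELECT ARRAY<INT64>[1, 2, 3]", read="bigquery")[0])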
1710 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1711 variables = self.expressions(expression, "this") 1712 default = self.sql(expression, "default") 1713 default = f" DEFAULT {default}" if default else "" 1714 kind = self.sql(expression, "kind") 1715 kind = f" {kind}" if kind else "" 1716 1717 return f"{variables}{kind}{default}"
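A sketch of the resulting DECLARE rendering, round-tripping a BigQuery scripting statement (the multi-variable form is an assumption; output indicative):

import sqlglot

# DECLARE items render as "<variables> <kind> DEFAULT <default>".
print(sqlglot.transpile("DECLARE x, y INT64 DEFAULT 0", read="bigquery")[0])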
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SUPPORTS_WINDOW_EXCLUDE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- UNICODE_SUBSTITUTE
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- SUPPORTS_BETWEEN_FLAGS
- SUPPORTS_LIKE_QUANTIFIERS
- MATCH_AGAINST_TABLE_PREFIX
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- is_sql
- like_sql
- ilike_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- mltranslate_sql
- mlforecast_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- install_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- space_sql
- buildproperty_sql
- refreshtriggerproperty_sql
- modelattribute_sql