sql-unifier/src/step1b-lib-core.sql at master · datasets-br/sql-unifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
/**
 * LIB v1.0 customized for SQL-Unifier project.
 * (no essential functions here, but all usefull)
 *
 * Useful functions library. All in only one schema, the LIB schema.
 * To "refresh version" drop cascade and redo all.
 */

DROP SCHEMA lib CASCADE;  -- caution (!) with existing libs
CREATE SCHEMA lib;


CREATE or replace FUNCTION lib.jtype_to_sql(text) RETURNS text AS $f$
	SELECT case
		WHEN $1='string' OR $1='null' THEN 'text'
		WHEN $1='number' THEN 'numeric'
		WHEN $1='array' THEN 'text[]'
		ELSE $1
	END
$f$ language SQL;


CREATE or replace FUNCTION lib.pg_varname(text) RETURNS text AS $f$
  SELECT regexp_replace(trim(regexp_replace( unaccent($1) , '[^\w0-9\(\)]+', '_', 'g'),'_'),'[\(\)_]+','_', 'g')
	--SELECT trim(regexp_replace( unaccent($1) , '[^\w0-9]+', '_', 'g'), '_')
$f$ language SQL;

CREATE or replace FUNCTION lib.pg_varname(text[]) RETURNS text[] AS $f$
	SELECT array_agg(lib.pg_varname(x)) FROM unnest($1) t(x);
$f$ language SQL;


/**
 * Percent avoiding divisions by zero.
 */
CREATE or replace FUNCTION lib.div_percent(
  float, float, -- a/b
  int DEFAULT NULL, -- 0-N decimal places or NULL for full
  boolean DEFAULT true -- returns zero when NULL inputs, else returns NULL
) RETURNS float AS $f$
   SELECT CASE
      WHEN $1 IS NULL OR $2 IS NULL THEN (CASE WHEN $4 THEN 0.0 ELSE NULL END)
      WHEN $1=0.0 THEN 0.0
      WHEN $2=0.0 THEN 'Infinity'::float
      ELSE CASE
        WHEN $3 IS NOT NULL AND $3>=0 THEN ROUND(100.0*$1/$2,$3)::float
        ELSE 100.0*$1/$2
      END
   END
$f$ language SQL IMMUTABLE;
CREATE or replace FUNCTION lib.div_percent(
  bigint, bigint, int DEFAULT NULL
) RETURNS float AS $wrap$
   SELECT lib.div_percent($1::float, $2::float, $3)
$wrap$ language SQL IMMUTABLE;
CREATE or replace FUNCTION lib.div_percent_int(bigint,bigint) RETURNS bigint AS $wrap$
   SELECT lib.div_percent($1::float, $2::float, 0)::bigint
$wrap$ language SQL IMMUTABLE;


-- -- -- -- -- --
-- Normalize and convert to integer-ranges, for postalCode_ranges.
-- See section "Preparation" at README.

CREATE or replace FUNCTION lib.csvranges_to_int4ranges(
  p_range text
) RETURNS int4range[] AS $f$
   SELECT ('{'||
      regexp_replace( translate($1,' -',',') , '\[(\d+),(\d+)\]', '"[\1,\2]"', 'g')
   || '}')::int4range[];
$f$ LANGUAGE SQL IMMUTABLE;


CREATE or replace FUNCTION lib.int4ranges_to_csvranges(
  p_range int4range[]
) RETURNS text AS $f$
   SELECT translate($1::text,',{}"',' ');
$f$ LANGUAGE SQL IMMUTABLE;


CREATE or replace FUNCTION lib.normalizeterm(
	--
	-- Converts string into standard sequence of lower-case words.
	--
	text,       		-- 1. input string (many words separed by spaces or punctuation)
	text DEFAULT ' ', 	-- 2. output separator
	int DEFAULT 320,	-- 3. max lenght of the result (system limit)
	p_sep2 text DEFAULT ' , ' -- 4. output separator between terms
) RETURNS text AS $f$
  SELECT  substring(
	LOWER(TRIM( regexp_replace(  -- for review: regex(regex()) for ` , , ` remove
		trim(regexp_replace($1,E'[\\n\\r \\+/,;:\\(\\)\\{\\}\\[\\]="\\s ]*[\\+/,;:\\(\\)\\{\\}\\[\\]="]+[\\+/,;:\\(\\)\\{\\}\\[\\]="\\s ]*|[\\s ]+[–\\-][\\s ]+',
				   p_sep2, 'g'),' ,'),   -- s*ps*|s-s
		E'[\\s ;\\|"]+[\\.\'][\\s ;\\|"]+|[\\s ;\\|"]+',    -- s.s|s
		$2,
		'g'
	), $2 )),
  1,$3
  );
$f$ LANGUAGE SQL IMMUTABLE;


CREATE or replace FUNCTION lib.msgcut(
  p_msg text, p_cutAt int DEFAULT 60
) RETURNS text AS $f$
  SELECT CASE WHEN $1=s THEN $1 ELSE s||'...' END FROM (SELECT substring($1,1,$2)) t(s);
$f$ LANGUAGE SQL IMMUTABLE;

-------------
-------------
----- HASH functions, to preserve name-id correspondence in the TUTORIALS and view names

CREATE or replace FUNCTION lib.sha1_cut7(p_word text) RETURNS int AS $f$
  -- About precise choice of truncation, see https://stackoverflow.com/q/4784335
  SELECT CASE WHEN $1='' THEN  0  ELSE ('x' || lpad(
      substr(encode(digest($1, 'sha1'), 'hex'), 1, 7)
      , 8, '0'
    ))::bit(32)::int END
$f$ LANGUAGE SQL IMMUTABLE;

CREATE or replace FUNCTION lib.hash_digits(
  p_word text,
  p_digits int DEFAULT 2,
  p_step int DEFAULT 2 -- minimal 2 to cut  never-zero-first
) RETURNS text AS $f$
  -- 0,1,2,..9,10,12,...,99,100,101,... Never 00 pad.
  SELECT CASE
      WHEN p_word IS NULL OR p_digits<1 OR p_step<2 OR p_step>6 THEN NULL
      WHEN p_word='' THEN '0'
      WHEN substr(x,1,1)='0' THEN  lib.hash_digits(p_word,p_digits,p_step+1)
      ELSE x
    END
  FROM (
    SELECT substr(i::text, p_step, p_digits) as x, i FROM (SELECT lib.sha1_cut7(p_word)) r(i)
  ) t
$f$ LANGUAGE SQL IMMUTABLE;


CREATE or replace FUNCTION lib.hash_digits_addnew(p_word text, p_list int[]) RETURNS int AS $f$
  -- Heuristic to good choice of minimal digits in hashed ID.
  -- for ns_id of dataset.ns, so we expect less tham 100 naspaces... but here work fine to 999
  -- see also n/k at https://en.wikipedia.org/wiki/Hash_table#Key_statistics
DECLARE
  len int;
  digits int;
  n int; -- new hash
  flag boolean;
BEGIN
  len    := array_length(p_list,1);
  digits := CASE WHEN len>9 THEN (CASE WHEN len>99 THEN 3 ELSE 2 END) ELSE 1 END; -- with coherence hypotesis for each id
  n      := lib.hash_digits(p_word,digits)::int;
  flag   := (p_list is not null AND len>0 AND n=ANY(p_list));
  IF flag THEN
    n    := lib.sha1_cut7(p_word) % (10*digits); -- like to use other hash, before +1
    flag := (n=ANY(p_list));
    IF flag AND ((digits=1 AND len>6) OR (digits=2 AND len>70)) THEN
        -- heuristic to reduce normal collisions at ~50%
      n    := lib.hash_digits(p_word,digits+1)::int;
      flag := (n=ANY(p_list));
    END IF;
    IF flag THEN
      SELECT min(x) INTO n
      FROM (SELECT unnest(p_list)+1 as x) t WHERE x NOT IN (SELECT unnest(p_list));
    END IF; -- flag2
  END IF; -- flag1
  RETURN n;
END
$f$ LANGUAGE PLpgSQL IMMUTABLE;


------------------
------------------
-- Intersection Lib-Framework

-- Error control, to JSON-RPC-like communication in the framework.

/**
 * Checks "response JSON object", returning TRUE when error.
 */
CREATE or replace FUNCTION lib.resp_is_error(JSONb) RETURNS boolean AS $f$
  SELECT CASE
      WHEN $1 IS NULL OR jsonb_typeof($1)='null' THEN false
      WHEN jsonb_typeof($1) IN ('object', 'array') THEN true
      ELSE true  -- something?  error of error?
    END
$f$ language SQL IMMUTABLE;

/**
 * Adds an error-response. If want to preserve "before error responses", flag that data is it.
 */
CREATE or replace FUNCTION lib.resp_error_add(
  code int,
  msg text DEFAULT NULL,
  data JSONb DEFAULT NULL
  --, data_isbefore DEFAULT false -- under construction, to change object-names.
) RETURNS JSONb AS $f$
  SELECT CASE
      WHEN $3 IS NULL OR jsonb_typeof($3)='null' THEN x
      ELSE jsonb_build_object('data',$3) || x
    END
  FROM (SELECT jsonb_build_object('code',$1, 'message',$2)) t(x)
$f$ language SQL IMMUTABLE;