Skip to content

Commit df0de84

Browse files
authored
Merge branch 'master' into feat/support-only-evaluating-survey-flags
2 parents ab25d92 + 3f5a9f8 commit df0de84

File tree

4 files changed

+168
-7
lines changed

4 files changed

+168
-7
lines changed

frontend/src/scenes/session-recordings/playlist/sessionRecordingsPlaylistLogic.test.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,29 @@ describe('sessionRecordingsPlaylistLogic', () => {
723723
})
724724
})
725725

726+
describe('set filters', () => {
727+
beforeEach(() => {
728+
logic = sessionRecordingsPlaylistLogic({
729+
key: 'cool_user_99',
730+
personUUID: 'cool_user_99',
731+
updateSearchParams: true,
732+
})
733+
logic.mount()
734+
})
735+
736+
it('resets date_to when given a relative date_from', async () => {
737+
await expectLogic(logic, () => {
738+
logic.actions.setFilters({
739+
date_from: '2021-10-01',
740+
date_to: '2021-10-10',
741+
})
742+
logic.actions.setFilters({
743+
date_from: '-7d',
744+
})
745+
}).toMatchValues({ filters: expect.objectContaining({ date_from: '-7d', date_to: null }) })
746+
})
747+
})
748+
726749
describe('convertUniversalFiltersToRecordingsQuery', () => {
727750
it('expands the visited_page filter to a pageview with $current_url property', () => {
728751
const result = convertUniversalFiltersToRecordingsQuery({

frontend/src/scenes/session-recordings/playlist/sessionRecordingsPlaylistLogic.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,8 @@ export interface SessionRecordingPlaylistLogicProps {
312312
onPinnedChange?: (recording: SessionRecordingType, pinned: boolean) => void
313313
}
314314

315+
const isRelativeDate = (x: RecordingUniversalFilters['date_from']): boolean => !!x && x.startsWith('-')
316+
315317
export const sessionRecordingsPlaylistLogic = kea<sessionRecordingsPlaylistLogicType>([
316318
path((key) => ['scenes', 'session-recordings', 'playlist', 'sessionRecordingsPlaylistLogic', key]),
317319
props({} as SessionRecordingPlaylistLogicProps),
@@ -492,8 +494,11 @@ export const sessionRecordingsPlaylistLogic = kea<sessionRecordingsPlaylistLogic
492494
})
493495
return getDefaultFilters(props.personUUID)
494496
}
497+
495498
return {
496499
...state,
500+
// if we're setting a relative date_from, then we need to clear the existing date_to
501+
date_to: filters.date_from && isRelativeDate(filters.date_from) ? null : state.date_to,
497502
...filters,
498503
}
499504
} catch (e) {

posthog/temporal/batch_exports/postgres_batch_export.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import dataclasses
66
import datetime as dt
77
import json
8+
import re
89
import typing
910

1011
import psycopg
@@ -59,6 +60,15 @@
5960
PostgreSQLField = tuple[str, typing.LiteralString]
6061
Fields = collections.abc.Iterable[PostgreSQLField]
6162

63+
# Compiled regex patterns for PostgreSQL data cleaning
64+
NULL_UNICODE_PATTERN = re.compile(rb"(?<!\\)\\u0000")
65+
UNPAIRED_SURROGATE_PATTERN = re.compile(
66+
rb"(\\u[dD][89A-Fa-f][0-9A-Fa-f]{2}\\u[dD][c-fC-F][0-9A-Fa-f]{2})|(\\u[dD][89A-Fa-f][0-9A-Fa-f]{2})"
67+
)
68+
UNPAIRED_SURROGATE_PATTERN_2 = re.compile(
69+
rb"(\\u[dD][89A-Fa-f][0-9A-Fa-f]{2}\\u[dD][c-fC-F][0-9A-Fa-f]{2})|(\\u[dD][c-fC-F][0-9A-Fa-f]{2})"
70+
)
71+
6272

6373
class PostgreSQLConnectionError(Exception):
6474
pass
@@ -418,12 +428,23 @@ async def copy_tsv_to_postgres(
418428
)
419429
) as copy:
420430
while data := await asyncio.to_thread(tsv_file.read):
421-
# \u0000 cannot be present in PostgreSQL's jsonb type, and will cause an error.
422-
# See: https://www.postgresql.org/docs/17/datatype-json.html
423-
data = data.replace(b"\\u0000", b"")
431+
data = remove_invalid_json(data)
424432
await copy.write(data)
425433

426434

435+
def remove_invalid_json(data: bytes) -> bytes:
436+
"""Remove invalid JSON from a byte string."""
437+
# \u0000 cannot be present in PostgreSQL's jsonb type, and will cause an error.
438+
# See: https://www.postgresql.org/docs/17/datatype-json.html
439+
# We use a regex to avoid replacing escaped \u0000 (for example, \\u0000, which we have seen in
440+
# some actual data)
441+
data = NULL_UNICODE_PATTERN.sub(b"", data)
442+
# Remove unpaired unicode surrogates
443+
data = UNPAIRED_SURROGATE_PATTERN.sub(rb"\1", data)
444+
data = UNPAIRED_SURROGATE_PATTERN_2.sub(rb"\1", data)
445+
return data
446+
447+
427448
def postgres_default_fields() -> list[BatchExportField]:
428449
batch_export_fields = default_fields()
429450
batch_export_fields.append(

posthog/temporal/tests/batch_exports/test_postgres_batch_export_workflow.py

Lines changed: 116 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import datetime as dt
55
import json
66
import operator
7+
import re
78
import unittest.mock
89
import uuid
910

@@ -37,6 +38,7 @@
3738
PostgreSQLHeartbeatDetails,
3839
insert_into_postgres_activity,
3940
postgres_default_fields,
41+
remove_invalid_json,
4042
)
4143
from posthog.temporal.batch_exports.spmc import (
4244
Producer,
@@ -186,10 +188,21 @@ async def assert_clickhouse_records_in_postgres(
186188
# bq_ingested_timestamp cannot be compared as it comes from an unstable function.
187189
continue
188190

191+
# Remove \u0000 from strings and bytes (we perform the same operation in the COPY query)
189192
if isinstance(v, str):
190-
v = v.replace("\\u0000", "")
193+
v = re.sub(r"(?<!\\)\\u0000", "", v)
191194
elif isinstance(v, bytes):
192-
v = v.replace(b"\\u0000", b"")
195+
v = re.sub(rb"(?<!\\)\\u0000", b"", v)
196+
# We remove unpaired surrogates in PostgreSQL, so we have to remove them here too so
197+
# that comparison doesn't fail. The problem is that at some point our unpaired surrogate gets
198+
# escaped (which is correct, as unpaired surrogates are not valid). But then the
199+
# comparison fails as in PostgreSQL we remove unpaired surrogates, not just escape them.
200+
# So, we hardcode replace the test properties. Not ideal, but this works as we get the
201+
# expected result in PostgreSQL and the comparison is still useful.
202+
if isinstance(v, str):
203+
v = v.replace("\\ud83e\\udd23\\udd23", "\\ud83e\\udd23").replace(
204+
"\\ud83e\\udd23\\ud83e", "\\ud83e\\udd23"
205+
)
193206

194207
if k in {"properties", "set", "set_once", "person_properties", "elements"} and v is not None:
195208
expected_record[k] = json.loads(v)
@@ -219,8 +232,18 @@ async def assert_clickhouse_records_in_postgres(
219232

220233
@pytest.fixture
221234
def test_properties(request, session_id):
222-
"""Include a \u0000 unicode escape sequence in properties."""
223-
return {"$browser": "Chrome", "$os": "Mac OS X", "unicode": "\u0000", "$session_id": session_id}
235+
"""Include some problematic properties."""
236+
try:
237+
return request.param
238+
except AttributeError:
239+
return {
240+
"$browser": "Chrome",
241+
"$os": "Mac OS X",
242+
"$session_id": session_id,
243+
"unicode_null": "\u0000",
244+
"emoji": "🤣",
245+
"newline": "\n",
246+
}
224247

225248

226249
@pytest.fixture
@@ -366,6 +389,76 @@ async def test_insert_into_postgres_activity_inserts_data_into_postgres_table(
366389
)
367390

368391

392+
@pytest.mark.parametrize("exclude_events", [None], indirect=True)
393+
@pytest.mark.parametrize("model", [BatchExportModel(name="events", schema=None)])
394+
@pytest.mark.parametrize(
395+
"test_properties",
396+
[
397+
{
398+
"$browser": "Chrome",
399+
"$os": "Mac OS X",
400+
"emoji": "🤣",
401+
"newline": "\n",
402+
"unicode_null": "\u0000",
403+
"invalid_unicode": "\\u0000'", # this has given us issues in the past
404+
"emoji_with_high_surrogate": "🤣\ud83e",
405+
"emoji_with_low_surrogate": "🤣\udd23",
406+
"emoji_with_high_surrogate_and_newline": "🤣\ud83e\n",
407+
"emoji_with_low_surrogate_and_newline": "🤣\udd23\n",
408+
}
409+
],
410+
indirect=True,
411+
)
412+
async def test_insert_into_postgres_activity_handles_problematic_json(
413+
clickhouse_client,
414+
activity_environment,
415+
postgres_connection,
416+
postgres_config,
417+
exclude_events,
418+
model: BatchExportModel,
419+
generate_test_data,
420+
data_interval_start,
421+
data_interval_end,
422+
ateam,
423+
):
424+
"""Sometimes users send us invalid JSON. We want to test that we handle this gracefully.
425+
426+
We only use the event model here since custom models with expressions such as JSONExtractString will still fail, as
427+
ClickHouse is not able to parse invalid JSON. There's not much we can do about this case.
428+
"""
429+
430+
batch_export_schema: BatchExportSchema | None = None
431+
batch_export_model = model
432+
433+
insert_inputs = PostgresInsertInputs(
434+
team_id=ateam.pk,
435+
table_name="test_table",
436+
data_interval_start=data_interval_start.isoformat(),
437+
data_interval_end=data_interval_end.isoformat(),
438+
exclude_events=exclude_events,
439+
batch_export_schema=batch_export_schema,
440+
batch_export_model=batch_export_model,
441+
**postgres_config,
442+
)
443+
444+
with override_settings(BATCH_EXPORT_POSTGRES_UPLOAD_CHUNK_SIZE_BYTES=5 * 1024**2):
445+
await activity_environment.run(insert_into_postgres_activity, insert_inputs)
446+
447+
sort_key = "event"
448+
await assert_clickhouse_records_in_postgres(
449+
postgres_connection=postgres_connection,
450+
clickhouse_client=clickhouse_client,
451+
schema_name=postgres_config["schema"],
452+
table_name="test_table",
453+
team_id=ateam.pk,
454+
data_interval_start=data_interval_start,
455+
data_interval_end=data_interval_end,
456+
batch_export_model=model,
457+
exclude_events=exclude_events,
458+
sort_key=sort_key,
459+
)
460+
461+
369462
async def test_insert_into_postgres_activity_merges_persons_data_in_follow_up_runs(
370463
clickhouse_client,
371464
activity_environment,
@@ -1342,3 +1435,22 @@ def track_heartbeat_details(*details):
13421435
expect_duplicates=True,
13431436
primary_key=["uuid"] if model.name == "events" else ["distinct_id", "person_id"],
13441437
)
1438+
1439+
1440+
@pytest.mark.parametrize(
1441+
"input_data, expected_data",
1442+
[
1443+
(b"Hello \uD83D\uDE00 World", b"Hello \uD83D\uDE00 World"), # Valid emoji pair (😀)
1444+
(b"Bad \uD800 unpaired high", b"Bad unpaired high"), # Unpaired high surrogate
1445+
(b"Bad \uDC00 unpaired low", b"Bad unpaired low"), # Unpaired low surrogate
1446+
(
1447+
b"\uD83C\uDF89 Party \uD800 \uD83D\uDE0A mixed",
1448+
b"\uD83C\uDF89 Party \uD83D\uDE0A mixed",
1449+
), # Mix of valid pairs and unpaired
1450+
(b"Hello \u0000 World", b"Hello World"), # \u0000 is not a valid JSON character in PostgreSQL
1451+
(b"Hello \\u0000 World", b"Hello World"), # this is the same as the above
1452+
(b"Hello \\\\u0000 World", b"Hello \\\\u0000 World"), # \\u0000 is escaped
1453+
],
1454+
)
1455+
def test_remove_invalid_json(input_data, expected_data):
1456+
assert remove_invalid_json(input_data) == expected_data

0 commit comments

Comments
 (0)